From c7b42e41d897fe1f928b29ae070bfcba2b147277 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Tue, 5 May 2026 09:30:57 -0400
Subject: [PATCH] fix(dev): make `npm run dev` serve full question content
 from local YAMLs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, the StaffML Next.js dev server fetched scenario and
details (including napkin_math) from the production Cloudflare Worker
even when contributors had local YAML edits — so changes weren't visible
without shipping. The opt-in static-fallback path existed but was wired
incorrectly: getStaticFullDetail used a Function-constructor dynamic
import of ../data/corpus.json, which Turbopack rewrote to a non-existent
/_next/static/data/corpus.json URL and 404'd at runtime.

Fix in three parts:

1. Loader (interviews/staffml/src/lib/corpus.ts): replace the broken
   dynamic import with fetch('/data/corpus.json'). On failure, throw a
   clear error pointing at `vault build --local`.

2. Build (interviews/vault-cli/src/vault_cli/commands/build.py): mirror
   the generated corpus.json into interviews/staffml/public/data/ so
   Next serves it as a static asset. Add --local as a clearer alias for
   --local-json and update the help text to spell out the dev workflow.

3. Wiring (interviews/staffml/package.json +
   scripts/build-local-corpus.mjs): predev now runs `vault build
   --local` automatically, with a soft-fail path if the vault CLI isn't
   installed (so first-time contributors still get a working dev server,
   just with the worker fallback).

The committed .env.development sets NEXT_PUBLIC_VAULT_FALLBACK=static so
the static path is the default in dev. Both copies of corpus.json are
gitignored as build artifacts (the YAMLs are the source of truth).
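For reviewers skimming without the corpus.ts hunk in front of them, the
new loader path is roughly the sketch below. Shape only: the
QuestionDetail type and the exact error text are illustrative, not the
literal hunk.

    // Sketch: fetch the corpus that `vault build --local` mirrors into
    // public/data/, which Next.js serves as a static asset at /data/.
    type QuestionDetail = { id: string; scenario: string; details: unknown };

    async function getStaticFullDetail(id: string): Promise<QuestionDetail> {
      const res = await fetch("/data/corpus.json");
      if (!res.ok) {
        // Actionable failure instead of the old Turbopack 404.
        throw new Error(
          "corpus.json missing; run `vault build --local`, then retry"
        );
      }
      const corpus: QuestionDetail[] = await res.json();
      const match = corpus.find((q) => q.id === id);
      if (!match) throw new Error(`question ${id} not in local corpus`);
      return match;
    }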
---
 interviews/staffml/.env.development           | 13 +++++
 interviews/staffml/.gitignore                 |  7 +++
 interviews/staffml/package.json               |  5 +-
 .../staffml/scripts/build-local-corpus.mjs    | 55 +++++++++++++++++++
 .../staffml/src/data/corpus-summary.json      |  2 +-
 .../staffml/src/data/vault-manifest.json      | 28 +++++-----
 interviews/staffml/src/lib/corpus.ts          | 26 +++++----
 .../vault-cli/src/vault_cli/commands/build.py | 27 +++++++--
 8 files changed, 129 insertions(+), 34 deletions(-)
 create mode 100644 interviews/staffml/.env.development
 create mode 100644 interviews/staffml/scripts/build-local-corpus.mjs

diff --git a/interviews/staffml/.env.development b/interviews/staffml/.env.development
new file mode 100644
index 000000000..a7b12cfea
--- /dev/null
+++ b/interviews/staffml/.env.development
@@ -0,0 +1,13 @@
+# Local-dev environment for `npm run dev`.
+#
+# This file is committed (unlike .env.development.local, which Next.js
+# gitignores by convention) and is auto-loaded only by `next dev`. It opts
+# the dev server into the local-static fallback, which serves question
+# scenario/details from public/data/corpus.json (regenerated on each
+# `npm run dev` by the predev hook running `vault build --local`).
+#
+# Without this var, the dev server falls back to the production Cloudflare
+# Worker for question content, which means local YAML edits aren't visible
+# until they ship — exactly the gotcha this file exists to prevent.
+
+NEXT_PUBLIC_VAULT_FALLBACK=static
diff --git a/interviews/staffml/.gitignore b/interviews/staffml/.gitignore
index 6314d978a..fd93e4099 100644
--- a/interviews/staffml/.gitignore
+++ b/interviews/staffml/.gitignore
@@ -15,6 +15,13 @@ tsconfig.tsbuildinfo
 # is the source of truth; don't commit the copy.
 public/question-visuals/
 
+# Build output — `vault build --local` materializes the full corpus from
+# vault/questions/*.yaml. Both copies are regenerated on every `npm run dev`
+# by the predev hook, so they're build artifacts, not source. The vault
+# YAMLs are the source of truth.
+src/data/corpus.json
+public/data/corpus.json
+
 # Playwright test artifacts (screenshots, videos, traces on failure).
 # The `tests/` directory itself IS committed.
 test-results/
diff --git a/interviews/staffml/package.json b/interviews/staffml/package.json
index 1795c497a..1f4f33b57 100644
--- a/interviews/staffml/package.json
+++ b/interviews/staffml/package.json
@@ -3,7 +3,7 @@
   "version": "0.0.1-dev",
   "private": true,
   "scripts": {
-    "predev": "node scripts/sync-periodic-table.mjs",
+    "predev": "node scripts/sync-periodic-table.mjs && node scripts/build-local-corpus.mjs",
     "dev": "next dev",
     "prebuild": "node scripts/sync-periodic-table.mjs",
     "build": "next build",
@@ -11,7 +11,8 @@
     "lint": "next lint",
     "test": "vitest run",
     "test:watch": "vitest",
-    "sync:periodic-table": "node scripts/sync-periodic-table.mjs"
+    "sync:periodic-table": "node scripts/sync-periodic-table.mjs",
+    "build:local-corpus": "node scripts/build-local-corpus.mjs"
   },
   "dependencies": {
     "@react-sigma/core": "^5.0.6",
diff --git a/interviews/staffml/scripts/build-local-corpus.mjs b/interviews/staffml/scripts/build-local-corpus.mjs
new file mode 100644
index 000000000..40145c86a
--- /dev/null
+++ b/interviews/staffml/scripts/build-local-corpus.mjs
@@ -0,0 +1,55 @@
+#!/usr/bin/env node
+/**
+ * Auto-run before `npm run dev` so the local Next.js dev server can
+ * serve the question corpus from disk via NEXT_PUBLIC_VAULT_FALLBACK=static.
+ *
+ * What it does:
+ *   1. Looks for the `vault` CLI on PATH.
+ *   2. Runs `vault build --local` from the repo root, which writes:
+ *        interviews/staffml/src/data/corpus.json    (legacy bundle)
+ *        interviews/staffml/public/data/corpus.json (the path the loader fetches)
+ *      and mirrors visual SVGs into public/question-visuals/.
+ *
+ * Skipped silently if `vault` is not installed (e.g. on a fresh checkout
+ * that hasn't run `pip install -e interviews/vault-cli` yet). The dev
+ * server still boots; it just falls back to the production worker for
+ * scenario/details, which is the same behavior contributors got before
+ * this hook was wired in.
+ *
+ * Override: set STAFFML_SKIP_LOCAL_CORPUS=1 to bypass entirely.
+ */
+import { spawnSync } from "node:child_process";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+if (process.env.STAFFML_SKIP_LOCAL_CORPUS === "1") {
+  console.log("[build-local-corpus] STAFFML_SKIP_LOCAL_CORPUS=1, skipping");
+  process.exit(0);
+}
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const REPO_ROOT = path.resolve(__dirname, "..", "..", "..");
+
+const which = spawnSync("which", ["vault"], { encoding: "utf8" });
+if (which.status !== 0 || !which.stdout.trim()) {
+  console.log(
+    "[build-local-corpus] `vault` CLI not on PATH; skipping local corpus rebuild.\n" +
+      "  To enable full-content rendering against your local YAMLs, run:\n" +
+      "    pip install -e interviews/vault-cli\n" +
+      "  then re-run `npm run dev`."
+  );
+  process.exit(0);
+}
+
+console.log("[build-local-corpus] running `vault build --local` ...");
+const r = spawnSync("vault", ["build", "--local"], {
+  cwd: REPO_ROOT,
+  stdio: "inherit",
+});
+if (r.status !== 0) {
+  console.error("[build-local-corpus] vault build failed; dev server will fall back to the worker.");
+  // Soft-fail: don't block dev server startup just because the local corpus
+  // isn't available. The worker fallback still gives a usable site.
+  process.exit(0);
+}
+console.log("[build-local-corpus] done.");
diff --git a/interviews/staffml/src/data/corpus-summary.json b/interviews/staffml/src/data/corpus-summary.json
index 836a22144..3c66ed659 100644
--- a/interviews/staffml/src/data/corpus-summary.json
+++ b/interviews/staffml/src/data/corpus-summary.json
@@ -1 +1 @@
-[{"id": "cloud-0000", "title": "The Per-Token KV-Cache Cost", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much FP16 KV-cache memory does one added token consume for this 40-layer, 64-head, 128-dim-head model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 655 KB", "About 2.62 MB", "About 32.7 KB", "About 1.31 MB"], "correct_index": 3}}, {"id": "cloud-0001", "title": "The Continuous Batching Target", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following metrics is continuous batching designed to primarily improve?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 0}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time To First Token (TTFT)", "System Throughput / Average Time Per Output Token (TPOT)", "Model Loading Time", "GPU Idle Time"], "correct_index": 1}}, {"id": "cloud-0002", "title": "The KV-Cache Memory Hog", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate VRAM consumed by the KV cache for this single request?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 0}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.56 GiB", "~32 GiB", "~244 GiB", "~2.56 MiB"], "correct_index": 2}}, {"id": "cloud-0004", "title": "The VRAM Cost of Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the FP16 KV-cache VRAM for one 4,096-token request on this 80-layer, d_model=8192 model?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 1}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.25 GB", "5 GB", "10 GB", "80 GB"], "correct_index": 2}}, {"id": "cloud-0006", "title": "The Throughput
Saturation Fallacy", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long does it take one saturated node to serve 5,000 tokens at 3,000 tokens/s, and does user distribution change that?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Scenario A is slower due to multi-user overhead.", "Scenario B is slower because long generations are inefficient.", "Both scenarios take ~1.67 seconds to complete.", "Both scenarios take ~0.06 seconds to complete."], "correct_index": 2}}, {"id": "cloud-0007", "title": "The Network Tax: NVLink vs. InfiniBand", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack data transfer using InfiniBand NDR compared to a GPU-to-GPU transfer within the same server using NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same latency (~1x)", "About 2x slower", "About 10x slower", "About 100x slower"], "correct_index": 2}}, {"id": "cloud-0008", "title": "The Speed of Light Constraint in RAG", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Ignoring compute, disk I/O, and queuing, what is the approximate minimum RTT for a single retrieval across the Atlantic?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5 µs (Inter-rack trap)", "~1 ms (Regional zone trap)", "~40 ms", "~500 ms (Satellite trap)"], "correct_index": 2}}, {"id": "cloud-0009", "title": "The Blue/Green Memory Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak GPU memory is required to hold both old and new FP16 7B model weights during the blue/green rollout?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 1}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB. The new model replaces the old one, so you only need space for one.", "7 GB. A 7B model requires 7GB of memory.", "28 GB. Both the old and new 14 GB models must be in memory at the same time.", "56 GB. 
An FP16 model uses 4 bytes/param, and you need two of them."], "correct_index": 2}}, {"id": "cloud-0011", "title": "The Llama 3 KV Cache Footprint", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the FP16 KV-cache VRAM for one 8,192-token Llama 3 8B request, and how does using KV heads versus query heads change it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4 GiB", "0.5 GiB", "1 GiB", "2 GiB"], "correct_index": 2}}, {"id": "cloud-0012", "title": "The True Cost of Batching on TTFT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What maximum batch size keeps TTFT under 250ms with a 100ms batching window and 15ms prefill per request?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 requests. The worst-case queueing time must be subtracted from the SLO before calculating batch capacity.", "16 requests. The SLO of 250ms can be divided directly by the 15ms per-request time.", "6 requests. This assumes the queue time (100ms) and processing time (150ms) are independent.", "It's unlimited. The H100 is fast enough that prefill time is negligible."], "correct_index": 0}}, {"id": "cloud-0013", "title": "The TPOT Memory Wall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary hardware bottleneck for Time Per Output Token (TPOT), and can the GPU theoretically meet this user's expectation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the GPU is compute-bound during generation and is limited to ~15 tokens/sec.", "No, FP16 memory reads (140GB) limit the GPU to ~24 tokens/sec, just barely meeting the demand.", "Yes, the GPU's memory bandwidth supports a theoretical speed of ~95 tokens/sec, so the issue is likely in the software.", "Yes, because the KV-cache makes all subsequent token generation instantaneous."], "correct_index": 2}}, {"id": "cloud-0014", "title": "The RAG Retrieval Step-Cost", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency you should expect for a single, random read from this SSD to retrieve a document chunk?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 ns", "~5,000 ns (5 µs)", "~100,000 ns (100 µs)", "~40,000,000 ns (40 ms)"], "correct_index": 2}}, {"id": "cloud-0016", "title": "The Skew from the Disk", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Based on the fundamental physics of a computer, what is a primary suspect for this drop in performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["The NVMe SSD read is roughly 10x slower than the memory access, which is a minor source of error.", "A subtle floating-point precision difference between the Python training and C++ serving environments.", "The NVMe SSD read is over 300x slower than HBM memory access, likely causing data to be unavailable at inference time.", "The CPU clock speed is dynamically throttled lower during inference, affecting numerical stability."], "correct_index": 2}}, {"id": "cloud-0018", "title": "The Static Batching Penalty", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum TTFT delay does static batching add for a single request to an idle server with batch size 8 and a 100ms timeout?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0ms, because the server is idle and can process the request immediately.", "~5ms, the approximate time to generate a single token.", "100ms, because the server waits for the batching timeout to expire.", "800ms, calculated by multiplying the batch size by the timeout."], "correct_index": 2}}, {"id": "cloud-0019", "title": "Continuous Batching and TPOT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the effective TPOT for the new user in the batch?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 1}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms, because the 20ms step time is multiplied by the 5 users in the batch.", "4ms, because the 20ms step time is divided by the 5 users in the batch.", "20ms, because one token is generated for all users in a single step.", "33ms, because the system will throttle to match the SLO exactly."], "correct_index": 2}}, {"id": "cloud-0020", "title": "The 4x Data Cost Bug", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the impact of this OTA change on daily ingest volume for the 100-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~240 TB per day", "~48 TB per day", "~480 TB per day", "~4.8 TB per day"], "correct_index": 2}}, {"id": "cloud-0022", "title": "The Continuous Batching Queue", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is the service stable at 5 RPS with 128 generated tokens per request, and what is the average request time in the system?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["160 ms. The system is under capacity, so latency is simply the time it takes to service one request.", "200 ms. The latency is the service time plus a 20% overhead for being busy.", "800 ms. 
The system is 80% utilized, leading to significant queueing delay which quintuples the average latency.", "The system is unstable and will crash, because the required token rate is too close to the maximum."], "correct_index": 2}}, {"id": "cloud-0023", "title": "The Blue/Green Memory Squeeze", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a single 80 GB H100 support a blue/green deployment switching from a 15B to a 30B FP16 model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, the new 60 GB model fits within the 80 GB capacity.", "Yes, the total memory needed is only 45 GB (15 GB + 30 GB).", "No, the combined 90 GB footprint exceeds the 80 GB capacity.", "Yes, there is 50 GB free, and the new model is only 30 GB larger."], "correct_index": 2}}, {"id": "cloud-0024", "title": "The True Cost of an A/B Test", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the FP16 memory footprint difference between the 1B and 7B models, and what is the serving cost implication?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["24 GB. The experiment will have a higher memory cost due to using 4 bytes per parameter.", "6 GB. The memory increase is manageable as it only requires 1 byte per parameter.", "12 GB. The experimental model requires 7x more memory, significantly increasing the serving cost per user in the A/B test.", "14 GB. The experimental model needs 14 GB, which is the primary cost driver."], "correct_index": 3}}, {"id": "cloud-0025", "title": "The KV-Cache VRAM Budget (cloud-0025)", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can this request be handled by a single H100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~140 GB. It will not fit, as this is larger than the H100's 80GB VRAM.", "~84 GB. It will just fit, but leaves no room for the model weights.", "~335 GB. It will not fit, as this is over 4x the H100's 80GB VRAM.", "~42 GB. 
It will require at least half of an H100's 80GB VRAM for a single request's KV-cache alone."], "correct_index": 3}}, {"id": "cloud-0026", "title": "The Continuous Batching Dilemma", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What happens to TTFT and token throughput if the batching wait window increases from 20ms to 100ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Average wait time decreases by 10ms.", "Average wait time remains 20ms.", "Average wait time increases by 40ms.", "Average wait time increases by 100ms."], "correct_index": 2}}, {"id": "cloud-0027", "title": "The Little's Law Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What happens to queue wait time as arrivals approach the 100 req/s service rate with a 10ms first-token service time?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 1}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Wait time scales linearly, reaching 99ms at 99 req/s, staying just under the 100ms SLO.", "Wait time remains near 10ms (the service time) as long as arrival rate < 100 req/s.", "Wait time grows exponentially, hitting 200ms at 95 req/s and violating the 100ms SLO.", "Wait time is bound by the 10ms compute time, allowing 99 req/s safely."], "correct_index": 2}}, {"id": "cloud-0028", "title": "The Interconnect Latency Ladder", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of these communication links typically has the highest intrinsic latency, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink 4.0 Transfer (GPU-GPU, within server)", "PCIe Gen5 Transfer (CPU-GPU, within server)", "InfiniBand NDR Transfer (server-to-server)", "HBM3 Memory Access"], "correct_index": 2}}, {"id": "cloud-0031", "title": "The KV Cache Memory Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a 7B FP16 model with a 128k-token request on an 80 GB GPU, what component is the primary OOM driver?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model parameters (weights)", "Optimizer state (e.g., Adam)", "The KV Cache", "Intermediate activations for the final token"], "correct_index": 2}}, {"id": "cloud-0033", "title": "The Runaway KV-Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How large is the FP16 KV-cache for the 64,000-token request, and why does it OOM an 80 GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~84 GB. 
This exceeds the available memory.", "~2.6 MB. This is negligible and shouldn't cause an OOM.", "~168 GB. The cache size far exceeds the remaining VRAM.", "35 GB. The memory is determined by the weights, so the error must be from fragmentation."], "correct_index": 2}}, {"id": "cloud-0035", "title": "The RAG Index Rollout", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If all 100 pods attempt to download the 10 GB file simultaneously from your artifact storage, how long would the download phase take?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.5 seconds", "0.2 seconds", "20 seconds", "0.025 seconds"], "correct_index": 2}}, {"id": "cloud-0036", "title": "The Head-of-Line Blocking Problem", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary latency-related problem that continuous batching is designed to solve compared to static batching?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 0}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It primarily increases the maximum theoretical throughput (tokens/sec) of the GPU.", "It solves head-of-line blocking, where short requests are stuck waiting for the longest request in a batch to complete.", "It reduces the VRAM required for the KV cache by using a different compression algorithm.", "It strictly processes requests in a first-in, first-out (FIFO) order to ensure fairness."], "correct_index": 1}}, {"id": "cloud-0041", "title": "The Batching Tipping Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the batch-1 arithmetic intensity, and how does increasing batch size shift the workload from memory-bound to compute-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The workload remains memory-bound because Arithmetic Intensity is constant; both compute and memory scale linearly.", "The workload is compute-bound at batch 1 and just becomes more compute-bound at batch 64.", "The workload shifts from memory-bound to compute-bound as the AI increases from ~33 to ~310 Ops/Byte.", "The workload becomes compute-bound, but its AI decreases because the memory grows faster than the compute."], "correct_index": 2}}, {"id": "cloud-0043", "title": "The Chatbot's Response Time", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long until the user receives the 20th output token with 150ms prefill and 30ms TPOT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600ms", "180ms", "750ms", "780ms"], "correct_index": 2}}, {"id": "cloud-0045", "title": "The Iceberg of Inference Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", 
"zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the dominant, long-term cost for this 24/7 RAG system?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 0}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of training the 7B model.", "The annual cost of storing the vector database embeddings.", "The annual cost of running the GPU for 24/7 inference.", "The network bandwidth costs for handling user queries."], "correct_index": 2}}, {"id": "cloud-0046", "title": "The KV-Cache Memory Bomb: VRAM Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory is required strictly for the KV-cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5.3 GB (Calculated Trap: Forgets either the K/V pair multiplier or 2-byte FP16 multiplier)", "~2.6 GB (Calculated Trap: Forgets both the K/V multiplier and 2-byte FP16 multiplier)", "~80 GB (Calculated Trap: Confuses KV-cache size with total model weight footprint)", "~10.7 GB"], "correct_index": 3}}, {"id": "cloud-0047", "title": "The Static Batching Latency Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For a request arriving just after the 50ms static batching window opens, what components make up its worst-case TTFT?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 0}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25ms", "50ms", "75ms", "100ms"], "correct_index": 2}}, {"id": "cloud-0048", "title": "The RAG Retrieval Bottleneck", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the most likely source of the RAG chatbot's high latency: LLM HBM access, network hop, or NVMe vector DB read?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reading documents from the NVMe SSD vector database", "LLM forward pass memory access to HBM", "Network transfer to the database server via InfiniBand", "L2 cache misses on the GPU during the forward pass"], "correct_index": 0}}, {"id": "cloud-0049", "title": "The Batching Window Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With a 200ms P99 TTFT SLO, 30ms network latency, and 50ms GPU time, what is the maximum queue wait for batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["150 ms", "170 ms", "120 ms", "200 ms"], "correct_index": 2}}, {"id": "cloud-0052", "title": "The Rollout 
Memory Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total additional HBM is required across 500 FP16 instances when upgrading from a 7B to a 13B model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3 TB", "13 TB", "6 TB", "12 TB"], "correct_index": 2}}, {"id": "cloud-0053", "title": "The Static Batching Waiting Game", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the average queue wait before processing for batch size 8 when requests arrive every 150ms?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 0}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms", "1050ms", "525ms", "800ms"], "correct_index": 2}}, {"id": "cloud-0055", "title": "The Concurrent User Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many concurrent users can one GPU support for a 13B FP16 model at 50% peak if each needs 64 tokens/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 148 users", "About 297 users", "About 594 users", "About 19,019 users"], "correct_index": 1}}, {"id": "cloud-0056", "title": "The On-Node Interconnect Tax: NVLink vs. PCIe", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a data transfer over a standard PCIe Gen5 bus compared to a direct GPU-to-GPU transfer using NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About the same latency", "~10x slower", "~2x slower", "~18x slower"], "correct_index": 2}}, {"id": "cloud-0057", "title": "The FP16 Inference Memory Rule", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the minimum RAM or HBM needed just to load Llama-3-8B weights in standard FP16 precision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 GB", "128 GB", "16 GB", "2 GB"], "correct_index": 2}}, {"id": "cloud-0058", "title": "The Continuous Batching Deadline: Batching Strategies", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the worst-case TTFT when a request arrives just after an iteration starts, and does it meet the 150ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40ms. 
The TTFT is determined solely by the prefill computation time.", "10ms. The TTFT is equivalent to the Time Per Output Token (TPOT).", "50ms. The TTFT is the prefill time plus the worst-case wait for the next iteration cycle.", "Up to several seconds. The request must wait for the longest sequence in the current batch to complete."], "correct_index": 2}}, {"id": "cloud-0059", "title": "The Basic Inference Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before even considering the KV cache or the retrieval index, what is the minimum memory required just to load the model's weights for inference using FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~7 GB", "~112 GB", "~14 GB", "~2 GB"], "correct_index": 2}}, {"id": "cloud-0061", "title": "The RAG Update Bottleneck", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming you have to ship the entire 200 GB file, how long would it take to transfer the index to a single serving pod over 400 Gbps InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["32 seconds", "2.7 minutes", "4 seconds", "0.5 seconds"], "correct_index": 2}}, {"id": "cloud-0062", "title": "The Real-Time Batching Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What maximum batch size fits a 100ms SLO for generating 50 tokens on a 7B model using 50% of H100 peak FP16 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["142", "5", "70", "23"], "correct_index": 2}}, {"id": "cloud-0063", "title": "The Blue-Green Memory Tax: Model Serving Infrastructure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total FP16 HBM is consumed just by the 7B and 13B model weights during the blue-green transition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 GB", "20 GB", "40 GB", "80 GB"], "correct_index": 2}}, {"id": "cloud-0064", "title": "The Batching Window Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the worst-case TTFT with 150ms slotted batching and 80ms prefill, and does it satisfy the 200ms real-time requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 ms. The TTFT is simply the pre-fill computation time.", "150 ms. The TTFT is determined by the batching window, as it's the longest delay.", "230 ms. 
The worst case is the full batch window delay plus the pre-fill time.", "70 ms. The available time is the batch window minus the compute time."], "correct_index": 2}}, {"id": "cloud-0065", "title": "The Cold Start Penalty", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much slower is this initial read from the SSD compared to a subsequent read from HBM?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 0}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30x slower", "~3x slower", "~330x slower", "~3,300x slower"], "correct_index": 2}}, {"id": "cloud-0066", "title": "The Canary Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much inference memory does the 70B FP16 model need, and can it fit on a single H100 GPU?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 0}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model requires 70 GB, so it will fit on the 80 GB H100.", "The model requires 140 GB; you just need to update the pod's memory request in Kubernetes.", "The model requires 140 GB, which exceeds the H100's 80 GB. 
A multi-GPU strategy is now required.", "The model requires over 1.1 TB to store optimizer states, making it impossible to serve."], "correct_index": 2}}, {"id": "cloud-0067", "title": "The Static Batching Throughput Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the largest static batch size that meets the 200ms TTFT SLA, and what token throughput does the 10ms decode step provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Max Batch: 20, Throughput: 2000 tokens/sec", "Max Batch: 13, Throughput: 1300 tokens/sec", "Max Batch: 10, Throughput: 1000 tokens/sec", "Max Batch: 10, Throughput: 100 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0068", "title": "The On-Node Interconnect Ladder", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which interconnects most likely correspond to these two latencies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ~500ns transfer is via L2 Cache (calculating ~1.5ns); the ~1,000ns transfer is via NVLink 4.0.", "The ~500ns transfer is via NVMe SSD read (calculating ~10,000ns); the ~1,000ns transfer is via PCIe Gen5.", "The ~500ns transfer is via NVLink 4.0; the ~1,000ns transfer is via PCIe Gen5.", "The ~500ns transfer is via InfiniBand NDR (calculating ~2,000ns); the ~1,000ns transfer is via NVLink 4.0."], "correct_index": 2}}, {"id": "cloud-0069", "title": "The RAG Retrieval Tax", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency for a single random read from an NVMe SSD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 ns (HBM Memory Access)", "~5 \\u00b5s (Cross-Rack InfiniBand)", "~100 \\u00b5s (NVMe SSD Read)", "~40 ms (Cross-Country Fiber)"], "correct_index": 2}}, {"id": "cloud-0072", "title": "The Inference Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For basic inference using half-precision (FP16), how much GPU memory should you budget just to load the model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "28 GB", "112 GB", "14 GB"], "correct_index": 3}}, {"id": "cloud-0075", "title": "The Static Batching Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the user's Time-To-First-Token (TTFT)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10ms", "40ms", "50ms", "30ms"], "correct_index": 2}}, {"id": 
"cloud-0076", "title": "The On-Node vs. Off-Node Divide", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which is faster for a small payload, same-server GPUs over NVLink or different servers over InfiniBand, and by roughly what factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same speed.", "InfiniBand is ~2x faster.", "NVLink is ~10x faster.", "NVLink is ~100x faster."], "correct_index": 2}}, {"id": "cloud-0077", "title": "The FP16 Inference Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To serve this model for inference, approximately how much GPU memory is required to hold just the model weights in standard half-precision (FP16)?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 0}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "14 GB", "28 GB", "112 GB"], "correct_index": 1}}, {"id": "cloud-0079", "title": "The On-Node vs. Off-Node Latency Chasm", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much slower is a cross-rack InfiniBand NDR transfer compared to an on-node NVLink 4.0 transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand is about the same latency as NVLink.", "InfiniBand is about 100x slower than NVLink.", "InfiniBand is about 10x slower than NVLink.", "InfiniBand is about 2x faster than NVLink."], "correct_index": 2}}, {"id": "cloud-0080", "title": "The 7B Model Memory Footprint: Model Serving Infrastructure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What absolute minimum memory is required to load the 7B model weights for FP16 inference before orchestration or KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "7 GB", "14 GB", "28 GB"], "correct_index": 2}}, {"id": "cloud-0081", "title": "The Canary Rollout Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What minimum VRAM is required to load the new 13B model for FP16 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["13 GB", "52 GB", "26 GB", "208 GB"], "correct_index": 2}}, {"id": "cloud-0082", "title": "The Continuous Batching Throughput Limit", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", 
"zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Using Little's Law, how would you estimate peak requests/sec and tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~66 QPS", "~1.7 QPS", "~16.5 QPS", "~2,112 QPS"], "correct_index": 2}}, {"id": "cloud-0084", "title": "The Static Batching Timeout Trap", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the TTFT components for this idle static-batching case, and what best-case TTFT will the user see?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5ms. (Calculated Trap: Only decoding time)", "50ms. (Calculated Trap: Only prefill time, missing timeout)", "250ms.", "255ms. (Calculated Trap: Prefill + decoding + missing timeout)"], "correct_index": 2}}, {"id": "cloud-0086", "title": "The Blue/Green Capacity Trap", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many H100 GPUs must you provision at peak for a blue-green rollout of this 10-replica deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 GPUs. The new model replaces the old one, so the same number of GPUs is sufficient.", "11 GPUs. You just need one extra GPU to start the rollout, and the orchestrator will handle the rest.", "20 GPUs. The entire new 'green' deployment must run in parallel with the old 'blue' deployment before traffic is switched.", "19 GPUs. The new model is roughly twice as large (13B/7B), so you need about twice the GPUs, but you can reuse one from the old fleet."], "correct_index": 2}}, {"id": "cloud-0087", "title": "The Real-Time Dilemma: TTFT vs. Throughput", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With an 80ms batching window and 40ms prefill, what worst-case TTFT should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40ms", "80ms", "120ms", "200ms"], "correct_index": 2}}, {"id": "cloud-0088", "title": "The On-Node vs. 
Cross-Node Latency Jump", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack InfiniBand NDR transfer compared to a transfer between two GPUs on the same server using NVLink 4.0?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 0}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x slower", "~10x slower", "~100x slower", "~1000x slower"], "correct_index": 1}}, {"id": "cloud-0089", "title": "The RAG Latency Trap: Compound AI Systems", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of these two stages is the dominant source of latency in the RAG pipeline?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 0}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The vector database lookup, because disk I/O is fundamentally slower than on-chip computation.", "The LLM generation, because it is an intensely memory-bandwidth-bound operation.", "They are roughly equal, with compute and I/O taking about the same amount of time.", "The network transfer between the database and the LLM server."], "correct_index": 1}}, {"id": "cloud-0092", "title": "The Real-Time Translation Bottleneck", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To understand the system's limits, what is the theoretical minimum baseline Time Per Output Token (TPOT) for a single, isolated user?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Less than 1 ms. The operation is compute-bound, limited by the H100's PetaFLOP-scale compute.", "Approximately 95 ms. The 3.35 TB/s bandwidth is measured in Terabits, not TeraBytes, reducing effective bandwidth.", "Approximately 12 ms. The operation is memory-bound by the time it takes to read 40 GB of weights over the 3.35 TB/s HBM interface.", "Around 300 ns. 
This is the fundamental latency of a single HBM3 memory access."], "correct_index": 2}}, {"id": "cloud-0093", "title": "The Serving Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What fundamental priority did we fail to invert?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0094", "title": "The P99 Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT spikes above 500ms despite a busy GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0095", "title": "The Throughput Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why can a high-throughput static-batched server still feel laggy when TPOT is ~12ms but TTFT is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0096", "title": "The Memory Wall of Long Contexts", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why can a few 16k-token conversations cause an OOM on an H100 system that comfortably handles 64 short-prompt users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0098", "title": "The Interactive API Latency Spike", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of the P99 TTFT exceeding 500ms with static batching and spiky GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0099", "title": "The Code-Gen Throughput Ceiling", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What approximate throughput improvement can continuous batching provide for output lengths [20, 50, 80, 100, 150, 200, 300, 1000]?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0100", "title": "The Unstable Translation Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the FIFO single-worker system meet the 200ms TTFT deadline at 10 requests/sec, and why?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 2}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0101", "title": "The Continuous Batching OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much KV-cache capacity remains after loading the 70B INT8 weights, and why can long contexts OOM the GPU?", "chain_ids": ["cloud-chain-auto-008-18"], "chain_positions": {"cloud-chain-auto-008-18": 0}, "chain_tiers": {"cloud-chain-auto-008-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0102", "title": "The Interactive Playground Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is a <200ms TTFT mathematically feasible for a 70B model, and what is the primary hardware bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0103", "title": "The High-Throughput API Crisis", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this poor performance despite the GPU being fully utilized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0104", "title": "The Real-Time Ad Bidding SLA", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the current single-server ad bidding system meet the 99.9% under 50ms SLA with 5% of requests taking 80ms, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0107", "title": "The Continuous Batching Hiccup", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely design flaw in your continuous batching scheduler causing these latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0108", "title": "The Live Caption Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum number of H100 servers (c) needed to keep 99% of 10-second transcription chunks under 500ms at 3 RPS, based on M/M/c queuing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0109", "title": "The TPOT Trade-off", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"inference", "question": "How much computation does static batching waste by padding 64 requests to 512 tokens, and what throughput gain should continuous batching give?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 2}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0110", "title": "The Algorithmic Trading Deadline", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely bottleneck causing the missed deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0111", "title": "The Chatbot Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using queueing theory, what is the fundamental state of this system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is memory-bound due to the KV cache size for 16 users.", "The system is operating in a stable but highly-loaded state (\\rho \\approx 99%).", "The network is saturated from streaming back responses for 16 users simultaneously.", "The system is unstable (\\rho > 1) because the arrival rate is higher than the service rate."], "correct_index": 3}}, {"id": "cloud-0112", "title": "The Prefill-Decode Collision", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why would a short prompt be forced to wait so long in a continuous batching system?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 0}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The KV cache ran out of memory, forcing an eviction/recomputation cycle.", "The continuous batching scheduler's overhead becomes too high with long sequences.", "The long-context prefill is a monolithic, non-preemptive operation that blocks new requests.", "The inter-token latency (TPOT) for the long-context request slowed down the whole system."], "correct_index": 2}}, {"id": "cloud-0113", "title": "The Static Batching TTFT Penalty", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is TTFT so high for ~100-token prompts when statically batched to 4096 tokens on an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network latency between the web server and the H100 is too high.", "The 7B parameter model is too large, causing slow weight loading from VRAM.", "Static batching forces every short prompt to pay the full computational cost of the 4096-token context 
window.", "The batch size is too small, which underutilizes the H100's Tensor Cores."], "correct_index": 2}}, {"id": "cloud-0114", "title": "The Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If VRAM is not the issue, what queueing effect explains why batch size 64 makes P99 latency skyrocket and effective throughput decrease?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU has hit a Memory-Capacity-Bound limit, running out of VRAM for the batch of 64.", "The system is Queue-Bound; utilization (rho) exceeds 1.0, causing head-of-line blocking and wait times to explode.", "The H100 is Bandwidth-Bound; memory bandwidth saturation strictly limits throughput to 2133 req/s.", "The CPU is Single-Thread-Bound, restricting the batch preparation rate to 50% of the GPU's capacity."], "correct_index": 1}}, {"id": "cloud-0115", "title": "The 60% Utilization Mystery", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely reason for this phenomenon?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU is memory-bandwidth bound, so compute cores are idle waiting for HBM. (Trap: assumes HBM limits utilization metric, but nvidia-smi util is time-based, not bandwidth-based)", "The Python GIL in the server is preventing the scheduler from running in parallel with GPU execution. (Trap: GIL affects threading, but continuous batching runs mostly in C++ backend)", "The request arrival rate is not high enough to fully saturate the server's capacity. 
(Trap: contradicts \"queue is full\" in scenario)", "The GPU is frequently idle, waiting for the CPU-bound scheduler to manage requests and memory between batches."], "correct_index": 3}}, {"id": "cloud-0116", "title": "The Reinforcement Learning Latency Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely bottleneck when sending 500,000 tiny CPU observation tensors per second to one GPU over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 ms wasted (Bandwidth-Bound)", "50 ms wasted (NVLink-Bound)", "500 ms wasted (Transaction-Overhead-Bound)", "5000 ms wasted (Memory-Bound)"], "correct_index": 2}}, {"id": "cloud-0117", "title": "The Translation API Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What batching behavior most likely causes the 1.5s P99 TTFT despite good TPOT in the H100 translation API?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 compute is underutilized; switching to a smaller, cheaper GPU would be more efficient.", "The network latency between the load balancer and the inference servers must be spiking to over 1 second.", "The 1000ms batching timeout is too high for the low arrival rate, causing requests to wait too long in the queue before processing.", "The model is too large, causing slow cold starts during token generation, which increases the time to the first token."], "correct_index": 2}}, {"id": "cloud-0118", "title": "The P99 Latency Explosion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely queueing cause of P99 TTFT exceeding 900ms with 20 RPS and static batching of 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The batch size is too small, resulting in inefficient, low-throughput GPU kernels.", "The H100 GPU is not powerful enough to handle 20 RPS, causing a persistent backlog of requests.", "Head-of-line blocking from the static batching strategy is creating extreme queueing delays for some requests.", "The network connection to the NVMe drives used for swapping KV-cache is saturated."], "correct_index": 2}}, {"id": "cloud-0120", "title": "The Chatbot's Awkward Silence", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is causing P99 TTFT over 800ms despite high GPU utilization and excellent TPOT with a 500ms static batching window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 70B model's prefill computation is too slow for the H100 GPU, creating a compute bottleneck.", "The system needs more H100 GPUs to handle the request volume and reduce queueing delays.", "The static 
batching window (500ms) forces requests to wait artificially, which is the primary contributor to TTFT.", "Network latency between the user and the datacenter is the most likely cause for the >800ms delay."], "correct_index": 2}}, {"id": "cloud-0121", "title": "The Unstable Chatbot Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely root cause of excessive TTFT with 10 RPS, a 400ms static batching timeout, batch size 4, and 450ms prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 450ms processing time is too slow for the GPU, indicating a compute bottleneck.", "The 400ms batching timeout is too short, preventing the formation of larger, more efficient batches.", "The arrival rate (10 RPS) exceeds the system's maximum service rate (~8.88 RPS), causing an unstable and ever-growing request queue.", "Network latency between the user and the datacenter is the primary contributor to the 800ms+ TTFT."], "correct_index": 2}}, {"id": "cloud-0122", "title": "The Chatbot Timeout Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary cause of the 800ms P99 TTFT with static batching of 32 and 450ms full-batch forward passes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU is not powerful enough. We should upgrade to reduce the batch processing time from 450ms.", "The static batching policy is causing head-of-line blocking. We should implement continuous batching to eliminate queuing delay.", "The 13B model is too large. We should quantize the model to INT8 to decrease the per-batch inference time.", "The issue is inefficient token generation. We should implement speculative decoding to improve Time Per Output Token (TPOT)."], "correct_index": 1}}, {"id": "cloud-0123", "title": "The Chatbot's Silent Wait", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is most likely causing the 800ms P99 TTFT when the GPU is only 60% utilized and static batching uses a 200ms timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 GPU is not powerful enough for this load. We should upgrade to a B200 to reduce the per-batch compute time.", "The model's prefill computation is the bottleneck. 
We should apply INT8 quantization to reduce the TFLOPs required.", "The fixed 200ms batching timeout is causing excessive queueing delay; requests wait idly instead of being processed.", "The bottleneck is network I/O from fetching user data for each request, causing the serving process to block before batching."], "correct_index": 2}}, {"id": "cloud-0124", "title": "The Overloaded Translator", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is TTFT P99 climbing steadily past 2 seconds at 12 RPS despite only 65% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The static batch size is too large, increasing per-batch latency. Reducing it to 8 would lower TTFT.", "The H100 GPU is underpowered for this model. GPU utilization would be 100% if it were the bottleneck.", "The system is overloaded because its maximum throughput is lower than the arrival rate, causing the request queue to grow. Static batching is artificially depressing throughput.", "The 40ms prefill latency is the primary bottleneck. Optimizing the data input path is the highest priority."], "correct_index": 2}}, {"id": "cloud-0125", "title": "The Chatbot Lag Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What explains the huge gap between 150ms P50 TTFT and 2.5s P99 TTFT with static batching of 64 at 95% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The prefill computation is stalling at 2.15s due to unoptimized attention kernels. (Mistaking queue time for compute time).", "The system is experiencing severe head-of-line blocking due to its static batching policy, causing 2.0s queue delays.", "The GPUs are throttling, causing a 16x degradation from 150ms to 2500ms. (Ignoring the static batching pipeline).", "Network I/O is saturated at the ingress, dropping the request packets. (Misdiagnosing system-level queueing as network congestion)."], "correct_index": 1}}, {"id": "cloud-0126", "title": "The Chatbot SLO Catastrophe", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the core problem causing the 500ms P99 TTFT SLO breach in this LLM chat service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100s are underpowered for this workload; the high utilization proves they can't keep up with the request volume.", "The InfiniBand network latency is adding too much overhead, causing requests to miss their deadline.", "Head-of-line blocking from static batching is causing massive queueing delays, and the average request wait time is 10x the SLO.", "GPU utilization is too high, leading to thermal throttling. 
We should reduce the batch size to give the GPU recovery time."], "correct_index": 2}}, {"id": "cloud-0128", "title": "The Translation API's Latency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What causes the 250ms TTFT SLA miss when static batching adds a 200ms window on top of 150ms prefill time?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 1}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The TTFT is 150ms + 50 tokens * 20ms = 1150ms, which means the model decode speed is the bottleneck.", "The system is experiencing head-of-line blocking. A request can wait in the queue for up to 200ms before processing even begins, pushing the total TTFT to 350ms.", "The request arrival rate is too high, overwhelming the system. The service needs more GPU replicas to handle the load.", "The TTFT is exactly 150ms since it does not include queue time, indicating a network latency issue."], "correct_index": 1}}, {"id": "cloud-0130", "title": "The Chatbot Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling issue explains low 40% GPU utilization and P99 TTFT over 200ms when average TTFT is only 30ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100's memory bandwidth is insufficient for the 7B model, making the memory-bound decode step (TPOT) the bottleneck.", "The prefill computation for processing the input prompt is too slow, making the service compute-bound on the GPU.", "The static batching timeout creates head-of-line blocking and inefficient small batches, leading to high queueing delay and low GPU utilization.", "Network latency for incoming requests is highly variable, and the serving system has no control over this external factor."], "correct_index": 2}}, {"id": "cloud-0131", "title": "The Chatbot Latency Crisis (cloud-0131)", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the 100ms static batching timeout is removed with continuous batching, what should happen to P99 TTFT and why?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 2}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Slightly worse, ~110ms. The smaller batch size has lower arithmetic intensity, reducing MFU and making each step slower, which dominates any queueing gains.", "~53ms. The average batch size is halved (32 -> 16), so the system's throughput is halved, and thus latency must also be halved.", "~105ms. The GPU is the fundamental bottleneck. Serving policy doesn't change the time it takes to compute a token, so the TTFT will remain the same.", "~1-5ms. 
The 100ms static batching timeout (T_queue) is eliminated. The new latency is simply the compute time of the prefill step, which is on the order of milliseconds."], "correct_index": 3}}, {"id": "cloud-0133", "title": "The Chatbot Lag Catastrophe", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT violation with a 150ms static batching timeout and only 40% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100's memory bandwidth is insufficient, causing a bottleneck when loading model weights for each batch. We should use tensor parallelism to split the model across multiple GPUs.", "The static batching timeout is too short. We should increase it to 300ms to create larger, more efficient batches, which will increase the 40% GPU utilization.", "The system is experiencing head-of-line blocking due to static batching, where new requests are stuck waiting for long-running batches to complete. Switching to continuous batching would solve this.", "The PCIe bus is saturated, preventing the CPU from feeding data to the H100 fast enough, which explains the low 40% utilization."], "correct_index": 2}}, {"id": "cloud-0135", "title": "The Translation Service Traffic Jam", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT spikes above 500ms with a 100ms static batching timeout and 90% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 13B model exceeds the H100's SRAM cache, forcing 500ms roundtrips to HBM.", "The static batching timeout is causing head-of-line blocking. Switch to continuous batching.", "The 90% utilization proves the GPU is compute-bound. Downsize the model to 7B.", "The 100ms timeout is too short to assemble optimal batches. 
Increase it to 500ms."], "correct_index": 1}}, {"id": "cloud-0137", "title": "The LLM Metrics", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What specific generation metrics are we failing to monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0138", "title": "The Shadow GPU Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is shadow-testing an LLM so much more expensive than shadow-testing a traditional ML model, and how do you make it feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0139", "title": "The Guardrail Latency Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the PM wrong, and how do you fix it without removing the guardrail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0140", "title": "The Feature Store Consistency Trap", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 15% value divergence between the online Redis and offline Parquet feature paths?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0141", "title": "The P99 Latency Anomaly", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can TPOT stay stable while P99 TTFT explodes under static batching, and what is the root cause of the TTFT explosion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0142", "title": "The Continuous Batching Plateau", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What practical system constraint is preventing them from reaching the theoretical throughput gains from continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0144", "title": "The ROI of Heterogeneity", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much throughput could continuous batching recover over static batching for a workload with 95% short and 5% 4k/4k long requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-0145", "title": "The SLO Squeeze: Interactive vs. Batch Throughput", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you use priority scheduling or partition the 8 GPUs between chat and batch jobs, and what trade-off does that create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0146", "title": "The Continuous Batching Tail Latency Paradox", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are 4000-token prompts causing 1.5-2s P99 TTFT spikes in the continuous batching loop, and how would you mitigate them?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 1}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0147", "title": "The Throttling Dilemma: Per-User vs. Global Queueing", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do global FIFO and per-user fair queuing differ under a burst from one power user in a multi-tenant LLM API?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0148", "title": "The P99 Latency Volcano", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency exploding near 50% of max throughput with static batch size 8, and should you add GPUs or change batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0149", "title": "The Two-Tier Traffic Jam", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a single continuous-batching configuration fail to optimize both chat TTFT and summarization TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0150", "title": "The Priority Queue Impasse", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do interactive users see multi-second TTFT in a FIFO continuous batcher at 100% GPU utilization, and what scheduling fix would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0151", "title": "The KV Cache Thrashing Cascade", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "Is this continuous-batched cluster compute-bound or KV-cache-capacity-bound, and what cascading failure explains the 30% RPS loss and P99 spikes?", "chain_ids": ["cloud-chain-auto-008-18"], "chain_positions": {"cloud-chain-auto-008-18": 1}, "chain_tiers": {"cloud-chain-auto-008-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0152", "title": "The Deadline-Missing Detector", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a system with 90% average utilization failing so badly, and what queueing strategy would you recommend?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 3}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0153", "title": "The Cannibalistic Batching Strategy", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the batching strategy cannibalizing its own efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0154", "title": "The Head-of-Line Blocking Crisis", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What flaw in the fixed batch size 8 static batching stack is causing poor P99 TTFT, and why is continuous batching better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0156", "title": "The Prefill vs. 
Decode Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might doubling batch size improve aggregate TPOT but harm TTFT and user experience for the code completion assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0159", "title": "The Coding Assistant's Latency Crisis", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is driving the 1500ms P99 TTFT despite excellent throughput, and why is the 1000ms static batching timeout the culprit?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 3}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0160", "title": "The Tyranny of Throughput", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do high GPU utilization and TPS still produce lag under a 150ms static batching timeout with a 200ms TTFT SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0161", "title": "The Code Assistant's Latency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What queueing failure causes the 13B code assistant to miss its 200ms P99 TTFT SLO despite fast per-request inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0162", "title": "The Real-Time Voice Assistant Stutter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is increasing static batch size from 8 to 32 the wrong fix for 40% GPU utilization, and what batching change should you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 13B model exceeds the H100's SRAM cache, forcing 500ms roundtrips to HBM.", "The static batching timeout is causing head-of-line blocking. Switch to continuous batching.", "The 90% utilization proves the GPU is compute-bound. Downsize the model to 7B.", "The 100ms timeout is too short to assemble optimal batches. 
Increase it to 500ms."], "correct_index": 1}}, {"id": "cloud-0163", "title": "The Black Friday Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did the system fail non-linearly, and how does continuous batching on GPUs shift this queueing knee point differently than traditional CPU-based web serving?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 4}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0164", "title": "The Input Chunking Pipeline Bubble", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does chunking the prompt slow down the time to first token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0166", "title": "The SLO-Violating Deadline Scheduler", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the deadline scheduler failing long jobs despite 50% utilization, and what statistical phenomenon is being ignored?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0167", "title": "The Speculative Decoding Memory Bomb", "topic": "speculative-decoding", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization designed to reduce latency cause a catastrophic memory failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0168", "title": "The Continuous Batching Paradox: Batching Strategies", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a continuous batching strategy catastrophically degrade P99 TTFT for your most sensitive users?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0169", "title": "The Throughput-Optimized Cascade Failure", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did optimizing for a peak hardware metric (TPOT) lead to a catastrophic system-level failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0171", "title": "The In-Flight Priority Queue Failure", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "Why is a priority queue insufficient for guaranteeing the SLA, and what physical constraint of the hardware is it failing to account for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0172", "title": "The Continuous Batching Paradox", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can throughput be up, but latency for our most important users be so much worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0173", "title": "The TTFT vs. TPOT Tug-of-War", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can't the 70B code assistant get both low TTFT at batch size 1 and excellent TPOT at large batch size, and what design can satisfy both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0174", "title": "The Real-Time Queue Collapse", "topic": "graceful-degradation", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did useful throughput fall below baseline during the 3x surge, and what critical deadline-aware feature is missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0176", "title": "The SLO Violation Cascade", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why won't adding replicas fix 150ms chat TTFT when 100k-token summarization prefills run on the same H100 pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0177", "title": "The Continuous Batching Death Spiral", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization designed for high throughput result in a catastrophic latency failure, and what fundamental law of systems have your team forgotten?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0178", "title": "The High-Priority Queue Stall", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does an optimization designed to increase throughput lead to a catastrophic failure in latency, and what is the primary cause of the excessive delay?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0179", "title": "The Continuous Batching 
Stall", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What scheduling flaw in continuous batching causes severe TTFT spikes under high load, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0180", "title": "The Multi-Tenant SLO Crisis", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What serving architecture lets latency-sensitive Swift requests avoid being blocked by long Deep prefills on the same H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0181", "title": "The Phoenix False Positive", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you triage the Phoenix false drowsiness alerts and redesign the pipeline to prevent environmental skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0182", "title": "The AI Analyst's Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three scheduling and capacity decisions for serving real-time news and long report jobs simultaneously on a shared cluster without violating the 500ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0183", "title": "The Silent Utilization Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did the 70B service become cost-inefficient after shifting from long document summarization to short conversational queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0184", "title": "The Earnings Call Meltdown", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does processing one transcription request at a time miss the 500ms P99 TTFT SLO, and what batching architecture fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0185", "title": "The Real-Time Translation Stalemate", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three design decisions to address this P99 latency explosion, and why are they the correct levers to pull?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0186", "title": "The 8 Petabyte Skew Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you redesign the log-to-training pipeline to handle 8 TB/day per vehicle and eliminate C++/Python training-serving skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0187", "title": "The SLA Collision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What serving architecture should replace static monolithic batches, and why is a priority queue insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0188", "title": "The 'Laggy' Code Assistant: A Batching Design Challenge", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What serving architecture can meet StaffML Code's strict TTFT and TPOT SLOs without the static batching trade-off?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 3}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0189", "title": "The Copilot Latency Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does static batching miss the 200 ms P99 TTFT while leaving GPUs underutilized?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 3}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0190", "title": "The Conversational AI Traffic Jam", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is your core architectural choice for request handling and batching to meet a 500ms P99 TTFT with highly variable query lengths, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0191", "title": "The Bi-Modal GPU Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve both products on a single B200 without violating SLAs or thrashing GPU memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0192", 
"title": "The Two-Tier SLA Conundrum", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect serving for the two product tiers on 16 GPUs without one tier hurting the other's SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0193", "title": "The Blackwell Disappointment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you diagnose and redesign the serving stack to achieve a 3x cost-per-token reduction without latency regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0194", "title": "The Interactive Coding Assistant SLO Catastrophe", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you prove static batching cannot meet the 500 ms TTFT and size capacity under continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0195", "title": "The Underutilized Accelerator", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the fundamental limitation of standard dynamic batching in this scenario, and what advanced technique maximizes GPU throughput and reduces tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0196", "title": "The OOMing Generator", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the most likely cause of these OOMs under high concurrency, and what technique mitigates this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0198", "title": "The Streaming vs Batch Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "When does token streaming reduce perceived latency for the 8-second chatbot response, and when can it hurt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0199", "title": "The Embedding Model Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a high-end GPU the right hardware for a 335M embedding model, and what batch size should you target?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0200", "title": "The Warm-up Request Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the first real request 60× slower after health check passes, and how many warm-up requests are needed before production traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0201", "title": "The Tokenizer Mismatch", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does a tokenizer version mismatch cause a massive GPU memory leak, and how does it destroy your serving economics?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 0}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0203", "title": "The Normalization Mismatch", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What's the most likely preprocessing bug?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0204", "title": "The Pre-computation Trade-off", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do we reduce inference compute costs without losing model accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0205", "title": "The Token Budget Economics", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which accelerator has the best cost per million tokens for serving a 70B LLM at realistic batch sizes, and when does that change?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 0}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0206", "title": "The Inference Server Autoscaling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is keeping one warm 70B replica 24/7 worth the $2,500/month, and what is the break-even versus scale-to-zero?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0207", "title": "The Triton Inference Server Ensemble", "topic": 
"compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do a Triton ensemble and a single Python process compare for latency and throughput in this RAG pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0208", "title": "The Structured Output Constraint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is this overhead acceptable, and can we reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0209", "title": "The A/B Testing at Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this statistically unnecessary but infrastructure-ruinous, and how does the GPU memory asymmetry between these models dictate your A/B testing architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0210", "title": "The Rate Limiting Design", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design a fair rate limiter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0211", "title": "The Structured Output Parsing Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the CPU doing that takes so much time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0212", "title": "The Continuous Batching Scheduler", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does continuous batching reduce the 13B model's static-batching latency, and what improvement should you expect?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 3}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0213", "title": "The GC Pause Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's stealing 760 ms from your GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0214", "title": "The TensorRT Incompatibility", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a driver update break a model file, and what's the correct deployment practice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0215", "title": "The BatchNorm Drift", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is silently changing to cause this accuracy drop without weight modifications, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0216", "title": "The One-Replica Meltdown", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is one slow replica destroying the P99 for the entire fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0217", "title": "The CPU Preprocessing Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where's the bottleneck causing the 15 tokens/s rate and sawtooth GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0218", "title": "The GIL Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the throughput plateau at 110 req/s regardless of how many replicas are added, and what is the ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0219", "title": "The Multi-Tenant GPU Sharing Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0220", "title": "The Request Routing Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do autocomplete requests experience 4-second P99 latency on a shared GPU pool, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0223", "title": "The Serverless Inference Trade-off", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the cost break-even point between SageMaker Serverless and a dedicated A10G, and which option wins for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0224", "title": "The Continuous Batching Starvation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How did your batching algorithm starve the easy requests?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 4}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0225", "title": "The LLM Canary Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do standard infra metrics miss LLM quality regressions, and how could you use the GPU's KV-cache memory profile as a hardware-level canary signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0226", "title": "The Multi-Model Serving Platform", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a multi-model serving platform that cuts the $2M/month GPU bill by at least 40% without violating SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0227", "title": "The Speculative Decoding Speedup", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Walk me through the systems math for speculative decoding — when does it help, and when does it backfire?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0228", "title": "The KV-Cache OOM Attack", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does a 64x increase in prompt length from a few users crash the entire cluster, and what's your emergency response?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0229", "title": "The Batching Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do we fix this idle capacity caused by static batching in 
an LLM API?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0230", "title": "The Speculative Decoding Accept Rate Crash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did the Speculative Decoding optimization become a performance penalty for the coding assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0231", "title": "The KV-Cache Context Explosion", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is this physically impossible?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0232", "title": "The Inference Cost Attribution Puzzle", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a fair inference cost attribution model for the five LLMs on the shared 64-A100 vLLM pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0233", "title": "The Speculative Memory Trade-off", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding degrade throughput at high batch sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0235", "title": "Handling KV Cache Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does PagedAttention specifically solve this issue in the KV cache?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 0}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It compresses the key and value tensors using quantization before storing them in the cache.", "It partitions the KV cache into non-contiguous fixed-size blocks, eliminating external fragmentation and allowing dynamic memory allocation per token.", "It proactively evicts the least recently used KV cache tensors to free up contiguous memory blocks for new requests.", "It offloads the KV cache to CPU RAM when GPU memory is fragmented and prefetches it when needed."], "correct_index": 1}}, {"id": "cloud-0236", "title": "Advantages of Continuous Batching", "topic": "batching-strategies", "competency_area": "latency", 
"track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How does continuous batching improve throughput compared to static batching?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 1}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It increases the clock speed of the GPU dynamically.", "It allows new requests to join the active batch as soon as other sequences complete.", "It batches all requests that have the exact same prompt length together.", "It caches the output tokens to reuse them."], "correct_index": 1}}, {"id": "cloud-0237", "title": "Mechanism of Speculative Decoding", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the fundamental mechanism that allows it to achieve speedup?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 0}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The target model is used only for the first few tokens, and the draft model completes the rest of the sequence to save time.", "The draft model generates multiple tokens sequentially, which the target model then verifies in a single parallel forward pass, accepting correct tokens and correcting the first divergence.", "The draft model and target model generate tokens in parallel, and a majority vote decides which token to output.", "The draft model continuously fine-tunes the target model during inference to make it generate tokens faster."], "correct_index": 1}}, {"id": "cloud-0238", "title": "Applying Little's Law to Inference Servers", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "With \\lambda = 10 requests/s and W = 5 seconds, how many concurrent requests L must the LLM server support to remain stable?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 0}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server needs to support 5 concurrent requests at any given time.", "The server needs to support 15 concurrent requests to have a buffer for peak loads.", "The server needs to support an average of 50 concurrent requests in the system.", "The server needs to support 2 concurrent requests, as 10 divided by 5 is 2."], "correct_index": 2}}, {"id": "cloud-0239", "title": "The Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does the 'ridge point' on a GPU roofline model represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The maximum memory bandwidth (TB/s) 
of the GPU.", "The maximum theoretical compute performance (TFLOPS).", "The minimum arithmetic intensity (FLOPs/Byte) required to be compute-bound.", "The power consumption (Watts) when the GPU is idle."], "correct_index": 2}}, {"id": "cloud-0240", "title": "Identifying the Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "With arithmetic intensity 40 FLOPs/Byte on an H100 ridge point of 295 FLOPs/Byte, is the workload compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 134 TFLOPS is lower than 989 TFLOPS.", "Memory-bound, because its arithmetic intensity is below the ridge point.", "Compute-bound, because it utilizes less than 15% of peak compute.", "Memory-bound, because 40 FLOPs/Byte exceeds the PCI-e bandwidth limit."], "correct_index": 1}}, {"id": "cloud-0241", "title": "The Role of HBM Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In the context of the GPU roofline model, what aspect of performance does the HBM (High Bandwidth Memory) bandwidth primarily determine?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 0}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The flat part of the roofline (the peak TFLOPS ceiling).", "The total capacity (GB) of the GPU's memory.", "The slope of the roofline for memory-bound workloads.", "The latency (in nanoseconds) of a single L1 cache access."], "correct_index": 2}}, {"id": "cloud-0242", "title": "The 70B Parameter Litmus Test", "topic": "activation-memory", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much memory do the FP16 weights of a 70B parameter model require, and does it fit on a single 80 GB NVIDIA H100?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 1}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 GB, so it fits with 10 GB to spare.", "1120 GB, so it does not fit.", "140 GB, so it does not fit.", "280 GB, so it does not fit."], "correct_index": 2}}, {"id": "cloud-0243", "title": "The B200's Architectural Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the B200's ridge point from 2,250 TFLOPS and 8.0 TB/s, and what does it imply for efficient workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~295 Ops/Byte, because it's similar to the H100.", "~2,250 Ops/Byte, assuming bandwidth was specified in Tb/s.", "~0.0035 Bytes/Op, because the ratio was inverted.", "~281 Ops/Byte, 
indicating it's a compute-bound architecture."], "correct_index": 3}}, {"id": "cloud-0244", "title": "The Chinchilla Time Tax", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Assuming 100% hardware utilization, how long would it take to complete this 5.8x10^23 FLOP training run on a single NVIDIA H100 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~6.8 days.", "~6,800 days.", "~1,130 days.", "~2,980 days."], "correct_index": 1}}, {"id": "cloud-0245", "title": "The Optimizer's Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much VRAM is needed just for Adam optimizer states and gradients for a 70B-parameter LLM, excluding model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 GB", "280 GB", "700 GB", "1120 GB"], "correct_index": 2}}, {"id": "cloud-0246", "title": "The Voracious KV-Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary driver of KV-cache memory size when serving an LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The number of model parameters (P)", "The batch size (B)", "The input sequence length (S)", "The GPU's memory bandwidth"], "correct_index": 2}}, {"id": "cloud-0247", "title": "The Activation Memory Bubble", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In what scenario are model activations most likely to exceed model weights in memory usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Serving a 13B LLM with a short context length.", "Training a large CNN on high-resolution images with a large batch size.", "Fine-tuning a 1B parameter model with a small batch size.", "Running inference with a quantized MobileNet on a single image."], "correct_index": 1}}, {"id": "cloud-0248", "title": "The Adam Optimizer Memory Footprint", "topic": "extreme-quantization", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Will the model and optimizer states fit into the 80 GB memory if you ignore activations and KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it fits. It only needs ~16 GB. (Weights Only Trap)", "Yes, it fits. It needs ~64 GB. (Missing Gradients Trap)", "No, it does not fit. It needs ~128 GB.", "Yes, it fits. It needs ~32 GB. 
(Missing Adam State Trap)"], "correct_index": 2}}, {"id": "cloud-0250", "title": "The Datacenter Power Wall", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate Thermal Design Power (TDP) of a single modern datacenter GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~150 W", "~350 W", "~700 W", "~1200 W"], "correct_index": 2}}, {"id": "cloud-0251", "title": "The Energy Cost of Solitude", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much more energy per request does batch size 1 use than batch size 32 on a 700W H100 with 10ms vs 60ms batch latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's about the same, since power is the same.", "It's about 2x more energy per request.", "It's over 5x more energy per request.", "It's over 30x more energy per request."], "correct_index": 2}}, {"id": "cloud-0253", "title": "Identifying a Memory-Bound Workload", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does this tell you about the workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The workload is compute-bound.", "The workload has an inefficient implementation that should be discarded.", "The workload is memory-bound.", "The workload is perfectly optimized."], "correct_index": 2}}, {"id": "cloud-0254", "title": "Calculating GEMM Kernel Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of this 200 GFLOP GEMM, and is it memory-bound on an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 FLOPs/Byte", "0.0075 Bytes/FLOP", "133.3 FLOPs/Byte", "400 FLOPs/Byte"], "correct_index": 2}}, {"id": "cloud-0255", "title": "The H100's Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the ridge point from 989 TFLOPS and 3.35 TB/s, and what does it mean for kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.0034 FLOPs/Byte", "295.2 FLOPs/Byte", "295,200 FLOPs/Byte", "3.39 FLOPs/Byte"], "correct_index": 1}}, {"id": "cloud-0256", "title": "The 16x VRAM Multiplier for Training", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much VRAM is required just to store the FP32 weights, gradients, and Adam optimizer states for a 70B-parameter LLM?", "chain_ids": 
["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 0}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 GB", "~560 GB", "~1120 GB", "~140 GB"], "correct_index": 2}}, {"id": "cloud-0258", "title": "The Mixed Precision Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With mixed-precision Adam for a 7B model, what persistent memory is required, and does it fit on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["56 GB", "84 GB", "98 GB", "112 GB"], "correct_index": 3}}, {"id": "cloud-0259", "title": "The MoE Compute Fallacy", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How should you estimate Chinchilla-optimal training compute for this 1T-parameter MoE with 2 of 10 experts active per token?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 0}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.2 x 10^26 FLOPs", "~8.0 x 10^23 FLOPs", "~4.8 x 10^24 FLOPs", "~2.4 x 10^25 FLOPs"], "correct_index": 2}}, {"id": "cloud-0260", "title": "The Chinchilla Compute Budget", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a Chinchilla-optimal 100B dense transformer, how many training tokens and FLOPs should you budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4.0 x 10^23 FLOPs", "1.2 x 10^23 FLOPs", "1.2 x 10^24 FLOPs", "6.0 x 10^22 FLOPs"], "correct_index": 2}}, {"id": "cloud-0262", "title": "The TOPS/W Efficiency Metric", "topic": "energy-per-operation", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does TOPS/W mean, and why is it critical for a datacenter architect managing thousands of GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a measure of peak performance, and higher is always better.", "It's primarily a concern for battery-powered mobile devices, not datacenters.", "It measures compute efficiency, which directly impacts power and cooling costs at scale.", "It defines the maximum thermal output a GPU can sustain before throttling."], "correct_index": 2}}, {"id": "cloud-0264", "title": "The HBM Latency Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency you should recall for a single memory access to HBM3 on a modern 
datacenter GPU?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 0}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4 ns", "~300 ns", "~1,000 ns (1 µs)", "~40 ns"], "correct_index": 1}}, {"id": "cloud-0265", "title": "The Energy Cost of Precision: Extreme Quantization", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure energy consumption perspective, what is the approximate energy savings per operation if you can perform the compute using INT8 versus FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x more energy", "~3.4x more energy", "~18x more energy", "The energy is roughly the same"], "correct_index": 2}}, {"id": "cloud-0266", "title": "The Inference Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory do the 70B model weights require in FP16 versus INT8, and how many GB are saved by quantizing?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 1}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 GB. (Misconception: Assumed FP32 baseline, but then confused weight footprint calculation)", "210 GB. (Misconception: Assumed FP32 baseline of 280GB and INT8 of 70GB, resulting in 210GB savings)", "70 GB. (Correct calculation: 140 GB for FP16 minus 70 GB for INT8)", "280 GB. 
(Misconception: Calculated total FP32 model size instead of savings)"], "correct_index": 2}}, {"id": "cloud-0267", "title": "The Chinchilla Data-Compute Ratio", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how many Chinchilla-optimal training tokens are required for a 70B-parameter LLM?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 Billion tokens", "3.5 Billion tokens", "1.4 Trillion tokens", "14 Trillion tokens"], "correct_index": 2}}, {"id": "cloud-0268", "title": "The Chinchilla Data Budget", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "According to the compute-optimal Chinchilla scaling laws, approximately how many training tokens should you plan to acquire?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Trillion tokens", "420 Billion tokens", "1.4 Trillion tokens", "120 Billion tokens"], "correct_index": 2}}, {"id": "cloud-0269", "title": "The 700W Question", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate Thermal Design Power (TDP) you should state for a single modern datacenter GPU, like the NVIDIA H100, to correctly inform the datacenter facilities team?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 0}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["150 W", "350 W", "700 W", "2000 W"], "correct_index": 2}}, {"id": "cloud-0270", "title": "The Datacenter Rack Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many one-H100 servers can fit in a 70 kW rack when each uses a 700W GPU, 300W non-GPU, and the PUE is 1.1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs. (Calculation: 70,000W / 700W)", "70 GPUs. (Calculation: 70,000W / 1,000W)", "63 GPUs. (Calculation: 70,000W / 1,100W)", "90 GPUs. 
(Calculation: 70,000W / 770W)"], "correct_index": 2}}, {"id": "cloud-0271", "title": "The Fusion Bottleneck", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When an ML compiler performs operator fusion, what is the primary hardware bottleneck it is designed to reduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total computational FLOPs", "Cross-node network traffic", "HBM Memory Bandwidth", "Model storage size on disk"], "correct_index": 2}}, {"id": "cloud-0272", "title": "The GPU Failure Cadence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Given a typical Mean Time To Failure (MTTF) of 50,000 hours for a single GPU, what is the expected frequency of a GPU failure somewhere in your fleet?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 0}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About once a month", "About once a week", "About once every 5 hours", "About once every 50,000 hours"], "correct_index": 2}}, {"id": "cloud-0273", "title": "The Inescapable Cost of Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How many GPU failures should you budget for during a 30-day training run on a 10,000-GPU cluster?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 1}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Approximately 6 failures. Calculated by dividing the 30-day run by the 5-hour failure interval.", "Less than 1. The 50,000-hour MTTF of a single GPU makes a failure within a 720-hour run extremely unlikely.", "Approximately 144 failures. The 720-hour run will see a failure roughly every 5 hours.", "About 3,600 failures. 
Calculated by multiplying the 720-hour run by the 5-hour failure interval."], "correct_index": 2}}, {"id": "cloud-0274", "title": "The Data Loading Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How long will one epoch's 10 TB read from a ~7 GB/s NVMe SSD take, and what bottleneck does that create for the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~19 hours", "~3.4 minutes", "~24 minutes", "~3.25 hours"], "correct_index": 2}}, {"id": "cloud-0275", "title": "The Iceberg of ML Costs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over three years, which cost is likely to dominate TCO: the one-time training run or continuous inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time model training cost.", "The cumulative cost of running inference.", "The cost of data acquisition and labeling.", "The salaries of the R&D and engineering teams."], "correct_index": 1}}, {"id": "cloud-0276", "title": "The CapEx vs. TCO Fallacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the estimated first-year TCO for 10 H100s including hardware ($30K/GPU) and 5% maintenance, and why is sticker price misleading?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$300,000", "$301,500", "$315,000", "$330,000"], "correct_index": 2}}, {"id": "cloud-0277", "title": "Defining Arithmetic Intensity: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the definition of Arithmetic Intensity in the context of a GPU roofline model?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The number of operations per second (FLOPs/sec)", "The number of operations per Watt (FLOPs/Watt)", "The ratio of operations to data movement (FLOPs/Byte)", "The total memory bandwidth (GB/s)"], "correct_index": 2}}, {"id": "cloud-0278", "title": "The Meaning of TOPS/W", "topic": "energy-per-operation", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does the TFLOPS/W metric primarily allow you to calculate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak theoretical speed of a single GPU", "The compute performance per unit of power, indicating efficiency", "The 
latency of a single operation", "The speed of the memory subsystem"], "correct_index": 1}}, {"id": "cloud-0279", "title": "The Batch Size 1 Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this ResNet-50 batch-size-1 forward pass compute-bound or memory-bound, and how do you determine that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 250 Ops/Byte is a high arithmetic intensity.", "Memory-bound, because the AI is ~0.004 Bytes/Op, which is far below the ridge point.", "Memory-bound, because the AI of 250 Ops/Byte is less than the H100's ridge point of ~295 Ops/Byte.", "Compute-bound, because with 8 GFLOPs of work, the compute units will be the bottleneck."], "correct_index": 2}}, {"id": "cloud-0280", "title": "The INT8 Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate energy consumption ratio between a single FP32 operation and a single INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["An FP32 op consumes ~4x more energy than INT8", "An FP32 op consumes ~3.4x more energy than INT8", "An FP32 op consumes ~18x more energy than INT8", "An FP32 op consumes ~580x more energy than INT8"], "correct_index": 2}}, {"id": "cloud-0282", "title": "The Intra-Node Speed Advantage", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the approximate latencies of NVLink 4.0 within an HGX server and cross-rack InfiniBand NDR, and how do they compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink: ~500 ns, InfiniBand: ~1,000 ns", "NVLink: ~5,000 ns, InfiniBand: ~500 ns", "NVLink: ~500 ns, InfiniBand: ~5,000 ns", "NVLink: ~1 ns, InfiniBand: ~5,000 ns"], "correct_index": 2}}, {"id": "cloud-0283", "title": "The I/O-Bound Cost Fallacy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Assuming the job is entirely bottlenecked by loading the data from storage, what is the cost of one run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$0.28", "$8,200.00", "$2.28", "$18.22"], "correct_index": 2}}, {"id": "cloud-0286", "title": "The Roofline Litmus Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What metric determines whether an H100 workload is compute-bound or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total parameter count of the model", "The model theoretical peak throughput (TFLOPS)", "Arithmetic Intensity (FLOPs per Byte)", "Power efficiency in TOPS/Watt"], "correct_index": 2}}, {"id": "cloud-0288", "title": "The Quantization Memory Dividend", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "After quantizing FP16 model weights to INT8 for inference, what is the approximate reduction factor in memory usage?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 0}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x reduction", "~1.5x reduction", "~2x reduction", "~8x reduction"], "correct_index": 2}}, {"id": "cloud-0290", "title": "The Great Interconnect Divide: NVLink vs. InfiniBand", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate raw bandwidth difference between NVLink 4.0 and InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They have roughly the same bandwidth.", "NVLink 4.0 is about 2x faster than InfiniBand NDR.", "NVLink 4.0 is about 18x faster than InfiniBand NDR.", "InfiniBand NDR is about 4x faster than NVLink 4.0."], "correct_index": 2}}, {"id": "cloud-0291", "title": "The INT8 Inference Memory Footprint", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the bare minimum VRAM required to load a 7-billion parameter Llama model's INT8 weights for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "14 GB", "7 GB", "700 MB"], "correct_index": 2}}, {"id": "cloud-0295", "title": "The Arithmetic Intensity Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Will this operation's performance be primarily limited by the GPU's compute power or its memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because it's a mathematical operation on a powerful GPU.", "Memory-bound, because the ratio of compute to data movement is very low.", "Neither, it's bound by NVLink bandwidth.", "It depends entirely on the size of the vector."], "correct_index": 1}}, {"id": "cloud-0296", "title": "The Arithmetic Intensity Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this preprocessing kernel with 10 TFLOPs per 500 GB read compute-bound or memory-bound, and why?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 10 TFLOPs is a very large number of operations that should saturate the GPU.", "Memory-bound, because its Arithmetic Intensity of ~20 Ops/Byte is far below the H100's ridge point of ~295 Ops/Byte.", "Network-bound, because transferring 500 GB of data is the bottleneck, regardless of computation.", "It's impossible to tell without knowing the kernel's execution time in milliseconds."], "correct_index": 1}}, {"id": "cloud-0298", "title": "The Datacenter Cooling Tax", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With 8 GPUs at 700 W each and PUE 1.15, what total rack power is consumed including cooling overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.60 kW", "4.87 kW", "6.44 kW", "7.55 kW"], "correct_index": 2}}, {"id": "cloud-0300", "title": "The 7B Inference Memory Check", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the minimum VRAM required to simply load the model's weights in FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "14 GB", "28 GB", "112 GB"], "correct_index": 1}}, {"id": "cloud-0301", "title": "The Lifecycle TCO Inversion", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over a multi-year lifecycle, how does the one-time training cost typically compare with the cumulative inference cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Training cost is dominant, typically 5-10x greater than cumulative inference cost.", "The costs are roughly equal (a 1:1 ratio).", "Cumulative inference cost is dominant, typically 5-10x greater than training cost.", "The costs are unrelated (CapEx vs. 
OpEx) and not directly comparable."], "correct_index": 2}}, {"id": "cloud-0302", "title": "Identifying the Roofline's Axis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which fundamental metric determines whether an H100 workload is compute-bound or memory-bound, and what is its definition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ratio of Bytes transferred per FLOP (Bytes/FLOP).", "The raw throughput of the GPU in TFLOPS.", "The power efficiency of the GPU in TOPS/W.", "The ratio of FLOPs performed per Byte of memory transferred (FLOPs/Byte)."], "correct_index": 3}}, {"id": "cloud-0304", "title": "The Data Center Rack Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With a 70 kW rack power limit and PUE 1.1, how many 700W GPUs can safely operate in the rack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs. (Calculated by 70,000W / 700W)", "110 GPUs. (Calculated by 70,000W × 1.1 / 700W)", "90 GPUs. (Calculated by 70,000W / 1.1 / 700W)", "83 GPUs. (Calculated by 70,000W / 1.2 / 700W)"], "correct_index": 2}}, {"id": "cloud-0305", "title": "The Blue-Green Pull Time", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how long does it take for a single server to download this container image over a modern datacenter network like InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 20 minutes", "About 1 minute", "About 12 seconds", "About 3 seconds"], "correct_index": 3}}, {"id": "cloud-0306", "title": "The RAG Pod Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming both inference models use FP16 weights, what total GPU memory is needed to load the 300M embedding model and 7B LLM?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 1}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14.0 GB", "7.3 GB", "14.6 GB", "116.8 GB"], "correct_index": 2}}, {"id": "cloud-0307", "title": "The TCO Iceberg: TCO & Cost Modeling", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over the model's expected 3-year production lifespan, which component is most likely to dominate the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The initial $1M training hardware cost.", "The 
electricity cost for the inference servers over 3 years.", "The salaries of the engineering team operating the service for 3 years.", "The network bandwidth costs for serving user traffic."], "correct_index": 2}}, {"id": "cloud-0309", "title": "The Ridge Point Rule", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the name of this ratio, and what fundamental bottleneck does being memory-bound signify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's called Throughput, and it means the GPU clock speed is too low.", "It's called Arithmetic Intensity, and it means the workload is bottlenecked by memory bandwidth.", "It's called Latency, and it means the PCI-e bus is saturated.", "It's called Arithmetic Intensity, and it means the workload is bottlenecked by the number of CUDA cores."], "correct_index": 1}}, {"id": "cloud-0311", "title": "The Real-Time Transcription Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "To meet this real-time constraint, what is the maximum average Time Per Output Token (TPOT) your system can have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["15 ms/token", "1000 ms/token", "67 ms/token", "15 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0313", "title": "The Node vs. The Network Latency Gap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack InfiniBand transfer compared to an on-node NVLink transfer in terms of pure latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly the same speed (within 2x)", "About 10x slower", "About 100x slower", "InfiniBand is faster than NVLink"], "correct_index": 1}}, {"id": "cloud-0319", "title": "The Cross-Rack Divide", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To connect the different server nodes together, which of the following technologies is the standard choice for the high-bandwidth, low-latency fabric?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink 4.0", "PCIe Gen5", "InfiniBand NDR", "HBM3"], "correct_index": 2}}, {"id": "cloud-0321", "title": "The Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the industry-standard rule of thumb you should use to estimate the annual hardware maintenance costs, as a percentage of the initial capital expenditure (CapEx)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.5%", "5%",
"25%", "50%"], "correct_index": 1}}, {"id": "cloud-0322", "title": "The A/B Test Power Bill", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the total electricity cost for the 30-day A/B test, including PUE, and what is the final number?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$108.00", "$118.80", "$83.16", "$118,800.00"], "correct_index": 1}}, {"id": "cloud-0323", "title": "The Datacenter Cooling Tax (PUE)", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a realistic PUE value for such a facility, and what does it physically represent?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 0}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PUE ≈ 2.0, meaning you need 1W of cooling for every 1W of compute.", "PUE ≈ 1.1, meaning you need 0.1W of cooling for every 1W of compute.", "PUE ≈ 1.0, meaning cooling is nearly free and consumes no extra power.", "PUE ≈ 0.9, meaning the cooling system generates its own power."], "correct_index": 1}}, {"id": "cloud-0325", "title": "The Memory-Bound Vision Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 50-TFLOP, 200-GB forward pass on an H100 compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 50 TFLOPs is a very high operational count that will saturate the execution units.", "Compute-bound, because its Arithmetic Intensity (250 Ops/Byte) is a high number, close to the hardware's peak.", "Memory-bound, because its Arithmetic Intensity (250 Ops/Byte) is less than the H100's ridge point (~295 Ops/Byte).", "Memory-bound, because it requires 4 Bytes per FLOP, which is too much data for the memory system."], "correct_index": 2}}, {"id": "cloud-0326", "title": "The Hidden Cost of Cooling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a PUE of 1.2 affect one 700W H100's power draw, and what is its daily electricity cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1.68 per day", "~$1.40 per day", "~$2.02 per day", "~$201.60 per day"], "correct_index": 2}}, {"id": "cloud-0327", "title": "The Blue/Green Memory Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much GPU memory is required for FP16 weights during the blue/green rollout of the 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "7 GB", "28 GB", "56 GB"], "correct_index": 2}}, {"id": "cloud-0329", "title": "The TCO of Privacy: Centralized vs. Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How should you compare the two training architectures over the 3-year project to choose the more economical one?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 1}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper with a TCO of $1.5M, vs. FL at $2.3M. (Error: Calculates for only 1 year)", "Federated Learning is cheaper with a TCO of $900k, vs. Centralized at $3M. (Error: Ignores CapEx)", "Federated Learning is cheaper with a 3-year TCO of $2.9M, vs. Centralized at $3.5M.", "Centralized is cheaper with a TCO of $2.0M, vs. FL at $3.9M. (Error: Mixes up CapEx and OpEx)"], "correct_index": 2}}, {"id": "cloud-0332", "title": "The RAG Rollout Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory will this new model consume just for the model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "56 GB", "14 GB", "28 GB"], "correct_index": 2}}, {"id": "cloud-0333", "title": "The Annual Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry rules of thumb, what is the approximate annual maintenance cost you should budget for this server, separate from power and operational costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1,500", "~$4,800", "~$24,000", "~$84,000"], "correct_index": 2}}, {"id": "cloud-0334", "title": "The CapEx vs. 
OpEx Blind Spot", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the standard hardware constants, what is the approximate TCO for this pod?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$240,000", "$276,000", "$297,000", "$258,000"], "correct_index": 2}}, {"id": "cloud-0336", "title": "The PUE Tax", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does PUE represent, and what total grid power does the 8-GPU server draw with a PUE of 1.1?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 1}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.60 kW", "0.77 kW", "6.16 kW", "6.72 kW"], "correct_index": 2}}, {"id": "cloud-0337", "title": "The On-Node vs. Off-Node Interconnect Gap", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much faster is a typical NVLink transfer compared to a cross-rack InfiniBand transfer in terms of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x faster", "About 10x faster", "About 100x faster", "They are roughly the same speed"], "correct_index": 1}}, {"id": "cloud-0340", "title": "The Cloud vs. 
Hybrid TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the annual fleet cloud compute costs under Scenarios A and B, and which is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Scenario A is more cost-effective at $1.2 Million.", "Scenario A is more cost-effective at $8.0 Million.", "Scenario B is more cost-effective at $2.4 Million.", "Scenario B is more cost-effective at $2,400."], "correct_index": 2}}, {"id": "cloud-0344", "title": "The TCO Blindspot: Hardware Maintenance", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Using standard industry rules of thumb, what is the approximate annual maintenance cost for a single H100 server with about $30,000 CapEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$150", "$1,500", "$3,000", "$6,000"], "correct_index": 1}}, {"id": "cloud-0348", "title": "The Kernel Launch Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the total latency of the fused kernel after combining the 10 µs MatMul and 2 µs ReLU with one 5 µs launch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 µs", "22 µs", "17 µs", "15 µs"], "correct_index": 2}}, {"id": "cloud-0349", "title": "The Deceptive Addition", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of C = A + B for FP16 tensors, and is it memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["AI is ~0.5 FLOPs/Byte, making it compute-bound. (Forgetting to include the read bytes for B and write bytes for C).", "AI is ~0.5 FLOPs/Byte, making it memory-bound. (Forgetting to include the write bytes for C).", "AI is ~0.25 FLOPs/Byte, making it memory-bound. 
(Calculating 1 FLOP / 4 Bytes by forgetting FP16 size for C).", "AI is ~0.167 FLOPs/Byte, making it memory-bound."], "correct_index": 3}}, {"id": "cloud-0350", "title": "The Rack Density Limit", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many H100s can a 70 kW rack support before exceeding cooling capacity, and how does typical system overhead change that number?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 1}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 GPUs", "100 GPUs", "700 GPUs", "10 GPUs"], "correct_index": 0}}, {"id": "cloud-0352", "title": "The OTA Update Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much storage is required for the 7B model's FP16 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "112 GB", "14 GB", "1.75 GB"], "correct_index": 2}}, {"id": "cloud-0355", "title": "The Arithmetic Intensity Litmus Test: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on the principles of the roofline model, what is the primary performance bottleneck for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory bandwidth, because the arithmetic intensity is far below the GPU's ridge point.", "Peak TFLOPS, because the model isn't doing enough operations to keep the cores busy.", "L2 cache latency, because the working data set doesn't fit in L1 cache.", "The power limit (TDP), because the GPU cannot draw enough power to run faster."], "correct_index": 0}}, {"id": "cloud-0356", "title": "H100: Compute or Memory Bound?", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this H100 kernel with 20 GFLOPs of work and 200 MB of HBM reads compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. Its Arithmetic Intensity is high, so performance is limited by the 989 TFLOPS of compute.", "Memory-bound. Its Arithmetic Intensity is 0.1 FLOPs/Byte, which is far below the H100's ridge point.", "Memory-bound. Its Arithmetic Intensity is 100 FLOPs/Byte, which is below the H100's ridge point of ~295 FLOPs/Byte.", "Compute-bound. 
Any operation performing 20 GFLOPs is inherently compute-intensive and will be limited by core speed, not memory."], "correct_index": 2}}, {"id": "cloud-0357", "title": "The Over-provisioned AI Rack", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What happens to 90 H100s in a 60 kW rack, and what sustained performance fraction can they reach versus theoretical peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The rack will run at 100% performance because each individual GPU's 700W TDP is well below the 60 kW rack limit.", "85 of the GPUs will run at 100% and the remaining 5 will be cleanly shut down to exactly meet the 60 kW budget.", "The rack is over-provisioned and will be throttled to ~95% of its peak performance.", "The rack is severely over-provisioned and will only run at about 80% of its peak performance."], "correct_index": 2}}, {"id": "cloud-0358", "title": "The RAG Fleet Update Bill", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What total data egress is billed to push a 500 MB vector index update to 10,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["500 MB (Misconception: Calculates for a single device, ignoring the fleet)", "140 TB (Misconception: Calculates using the 14GB model size instead of the 500MB index size)", "5 TB", "50 TB (Misconception: Unit conversion or arithmetic error)"], "correct_index": 2}}, {"id": "cloud-0359", "title": "The CapEx Baseline", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost for a single NVIDIA H100 GPU, which forms the baseline for your Total Cost of Ownership (TCO) calculation?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 0}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000 (Assuming consumer-grade hardware pricing)", "~$10,000 (Assuming previous generation A100 pricing)", "~$30,000 (Accurate H100 enterprise baseline)", "~$240,000 (Mistaking an 8-GPU HGX baseboard for a single GPU)"], "correct_index": 2}}, {"id": "cloud-0360", "title": "The Federated Learning Data Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much gradient data will 10 million users upload per week at 200 MB each, and what does that imply for backend scale?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 0}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2 Terabytes", "200 Terabytes", "2 Petabytes", "20 Petabytes"], 
"correct_index": 2}}, {"id": "cloud-0366", "title": "The H100 Unit Cost: TCO & Cost Modeling", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost of a single NVIDIA H100 GPU, a standard component for large-scale AI training in the cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000 (Confuses datacenter GPU with a high-end consumer card)", "~$10,000 (Underestimates the premium for enterprise-grade features)", "~$30,000", "~$100,000 (Confuses the cost of a single GPU with a fully-equipped server)"], "correct_index": 2}}, {"id": "cloud-0367", "title": "The TCO of Privacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What annual data-transfer cost would centralizing 10M users' 1MB/day create, and how much would FL avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$71 per year", "$200 per year", "$73,000 per year", "$584,000 per year"], "correct_index": 2}}, {"id": "cloud-0369", "title": "The H100 Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the FP16 ridge point for an H100 GPU, and what does that number say about workload performance?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.003 Ops/Byte. This implies almost any workload is compute-bound.", "~37 Ops/Byte. This results from incorrectly using bits instead of bytes for memory bandwidth.", "~295 Ops/Byte. A workload's arithmetic intensity must exceed this to be compute-bound.", "~1,342 Ops/Byte. 
This is the ridge point for an edge device (Jetson AGX Orin), not a datacenter GPU."], "correct_index": 2}}, {"id": "cloud-0372", "title": "The Iceberg of Inference Costs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When analyzing the Total Cost of Ownership (TCO) for this model, which of the following components typically contributes the most to the total cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of the initial model training run.", "The salaries of the ML engineering and research team.", "Ongoing inference costs (server hosting, power, maintenance).", "The cost of data acquisition, cleaning, and labeling."], "correct_index": 2}}, {"id": "cloud-0374", "title": "The Power Efficiency Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Within a 30W TDP budget, which accelerator delivers higher sustained INT8 throughput, and what effective TOPS can it sustain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Hailo-8, because its TOPS/W efficiency is over 2x higher.", "Jetson Orin, because its peak performance (275 TOPS) is the highest.", "Jetson Orin, because at 30W it can sustain ~137 TOPS, while the Hailo-8 is capped at 26 TOPS.", "Hailo-8, because with its higher efficiency it can deliver 312 TOPS (10.4 TOPS/W * 30W) in a 30W budget."], "correct_index": 2}}, {"id": "cloud-0375", "title": "The 70kW Rack Limit", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many 700W H100 GPUs can you safely install in a 70 kW rack when PUE is 1.2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs", "120 GPUs", "83 GPUs", "87 GPUs"], "correct_index": 2}}, {"id": "cloud-0377", "title": "The RAG Rollback Reflex", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the standard, most immediate operational response to this incident?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Begin a root-cause analysis by inspecting the new data file for corruption.", "Take the entire chatbot system offline to prevent more bad answers.", "Immediately trigger an automated rollback to the previous version of the vector database.", "Fine-tune the base LLM with the new information to teach it the correct behavior."], "correct_index": 2}}, {"id": "cloud-0380", "title": "The Datacenter Heat Wave", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If cooling can dissipate only 80% of an H100's 700W TDP, what sustained FP16 throughput should you expect and 
why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~198 TFLOPS (Calculated Trap: Calculates 20% performance instead of 80%)", "989 TFLOPS (Calculated Trap: Assumes TDP throttling does not impact throughput)", "~791 TFLOPS", "560 TFLOPS (Calculated Trap: Confuses wattage limit with TFLOPS output)"], "correct_index": 2}}, {"id": "cloud-0381", "title": "The On-Node Interconnect Hierarchy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a transfer over the PCIe Gen5 bus compared to a direct transfer over NVLink 4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PCIe is ~10x slower", "They are about the same speed", "PCIe is ~2x slower", "PCIe is ~4x slower"], "correct_index": 2}}, {"id": "cloud-0382", "title": "The 5% Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry standards, what is the approximate annual cost for hardware maintenance, expressed as a percentage of the initial CapEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1%", "5%", "20%", "33%"], "correct_index": 1}}, {"id": "cloud-0383", "title": "The First-Year TCO of a Small AI Factory", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What first-year TCO should finance budget for 10 GPUs running 24/7, including CapEx, maintenance, and PUE-adjusted power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$315,000", "~$322,300", "~$336,700", "~$8,400,000"], "correct_index": 2}}, {"id": "cloud-0384", "title": "The H100's Memory Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the 989 TFLOPS FP16 compute and 3.35 TB/s memory bandwidth, why is the kernel memory-bound, and what is its arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; it has nearly a PetaFLOP of compute. (Calculated trap: Ignores Arithmetic Intensity)", "Memory-bound; its Arithmetic Intensity (~0.167 Ops/Byte) is far below the ridge point (~295 Ops/Byte). (Correct)", "Compute-bound; its Arithmetic Intensity (~295 Ops/Byte) is very high. (Calculated trap: Uses ridge point as Arithmetic Intensity)", "Memory-bound; its Arithmetic Intensity is ~0.5 Ops/Byte. 
(Calculated trap: Ignores write byte cost)"], "correct_index": 1}}, {"id": "cloud-0385", "title": "The Guardrail Canary Cost", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much additional HBM is required on canary instances to keep both 1B-parameter FP16 guardrail models loaded?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 GB", "4 GB", "2 GB", "16 GB"], "correct_index": 2}}, {"id": "cloud-0386", "title": "The Dominant Factor in TCO", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which phase typically accounts for the largest portion of the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of the initial model training run.", "The cumulative cost of running the inference fleet.", "The cost of data acquisition and labeling.", "The salaries of the engineering team maintaining the model."], "correct_index": 1}}, {"id": "cloud-0387", "title": "The CapEx of an A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What initial GPU CapEx is required for a three-month shadow-serving experiment that needs 100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$30,000", "$300,000", "$3,000,000", "$150,000"], "correct_index": 2}}, {"id": "cloud-0389", "title": "The RAG Rollout Bandwidth Bill", "topic": "compound-ai-systems", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much total data must be pulled from the container registry to the nodes for this initial canary deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["400 GB", "7.5 TB", "750 GB", "700 GB"], "correct_index": 2}}, {"id": "cloud-0390", "title": "The Hidden Cost of Hardware", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry rules of thumb, what is the approximate annual maintenance cost for a single server GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$150", "$1,500", "$7,500", "$15,000"], "correct_index": 1}}, {"id": "cloud-0393", "title": "The Cross-Rack Communication Gap", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For cross-rack GPU communication during distributed training, what is the lowest-latency interconnect and its approximate latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["NVLink 4.0 at ~500 ns", "PCIe Gen5 at ~1,000 ns", "InfiniBand NDR at ~5,000 ns", "Cross-country Fiber at ~40,000,000 ns"], "correct_index": 2}}, {"id": "cloud-0394", "title": "The RAG Update Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can the FP16 13B RAG model plus its 4 GB vector index fit within a 32 GB container memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 GB. It fits easily.", "17 GB. It fits with plenty of room.", "30 GB. It fits, but with a small margin.", "212 GB. It does not fit; it requires a much larger node."], "correct_index": 2}}, {"id": "cloud-0396", "title": "The A/B Test Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the total infrastructure cost for a one-week 50/50 experiment, assuming an H100 cloud instance costs $4.50/hour?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$540 (Trap: 5 GPUs * 24 hours * $4.50 = 1 day only)", "$2,268 (Trap: 3 GPUs * 168 hours * $4.50 = Challenger only)", "$3,024 (Trap: 4 GPUs * 168 hours * $4.50 = Assumes same size fleets)", "$3,780 (Correct)"], "correct_index": 3}}, {"id": "cloud-0398", "title": "The RAG Model Upgrade", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many Gigabytes of memory will the new 7B parameter model require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "28 GB", "14 GB", "112 GB"], "correct_index": 2}}, {"id": "cloud-0402", "title": "The TCO Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What rule of thumb estimates annual hardware maintenance as a percentage of initial CapEx for a large GPU cluster?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.5%", "20%", "5%", "33%"], "correct_index": 2}}, {"id": "cloud-0403", "title": "The TCO of Privacy: Federated Learning's Compute Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If the centralized model requires 6.0 x 10^23 FLOPs to train, what is the additional compute cost the company will incur by adopting Federated Learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["`7.2 x 10^23` FLOPs", "`1.2 x 10^22` FLOPs", "`1.2 x 10^23` FLOPs", "`4.8 x 10^23` FLOPs"], "correct_index": 2}}, {"id": "cloud-0405", "title": "The Great Divide: Intra-Node vs. 
Inter-Node", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If you need to transfer a large tensor between two GPUs located within the same physical server, which interconnect is designed to provide the highest bandwidth for this specific task?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand NDR", "PCIe Gen5", "NVLink 4.0", "HBM3 Memory"], "correct_index": 2}}, {"id": "cloud-0407", "title": "The CapEx Foundation of TCO", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost for a single NVIDIA H100 GPU that you would use as the baseline for this calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000", "~$300,000", "~$30,000", "~$3/hour"], "correct_index": 2}}, {"id": "cloud-0408", "title": "The TCO of an H100", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the approximate 3-year TCO for one H100, including $30k CapEx, 700W at PUE 1.2, $0.10/kWh power, and 5% maintenance?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 1}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$32,200", "~$36,340", "~$36,700", "~$2,242,000"], "correct_index": 2}}, {"id": "cloud-0409", "title": "The MatMul Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is a 4096x4096 FP16 matrix multiplication on a modern accelerator compute-bound or memory-bound, and why based on arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because accessing over 100 MB from HBM is the primary bottleneck.", "Memory-bound, because its Arithmetic Intensity is low (~8 Ops/Byte), falling below the hardware's ridge point.", "Compute-bound, because its Arithmetic Intensity (~1365 Ops/Byte) is significantly higher than the hardware's ridge point (~295 Ops/Byte).", "It's impossible to tell without knowing the GPU's cache hit rate for the operation."], "correct_index": 2}}, {"id": "cloud-0411", "title": "The TCO Iceberg: Training vs. 
Inference", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over a typical 3-year production ML lifecycle, what is the approximate cost ratio of cumulative inference to initial training?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Training is 5-10x more expensive than inference.", "They are roughly equal in cost (1:1 ratio).", "Inference is 5-10x more expensive than training.", "Inference is over 100x more expensive than training."], "correct_index": 2}}, {"id": "cloud-0414", "title": "The RAG Rollout Storage Tax", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What peak vector-index storage is required during a blue-green rollout replacing a 14 GB RAG index with a new 14 GB version?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "21 GB", "28 GB", "42 GB"], "correct_index": 2}}, {"id": "cloud-0415", "title": "The Hidden Cost of Federation", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "At production scale, what is the largest and most underestimated new TCO cost introduced by adopting Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network bandwidth costs from millions of devices sending model updates.", "Compute cost of the central server that aggregates model updates.", "Sustained engineering and operational complexity to manage the distributed fleet.", "The CapEx for 100 aggregation servers."], "correct_index": 2}}, {"id": "cloud-0416", "title": "The Annual Energy Cost of an Inference Cluster", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the approximate annual energy cost for the 10 H100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$6,132 (Calculated Trap: Ignores the 1.1 PUE overhead)", "~$15,000 (Calculated Trap: Calculates 5% annual maintenance on CapEx)", "~$6,745", "~$67,452 (Calculated Trap: Assumes $1.00/kWh instead of $0.10/kWh)"], "correct_index": 2}}, {"id": "cloud-0417", "title": "The H100 Roofline Ridge Point: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the H100's FP16 ridge point, and what does it imply for memory- vs compute-bound kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~295,000 FLOPs/Byte. This indicates most models are memory-bound.", "~0.0034 FLOPs/Byte. This indicates most models are compute-bound.", "~295 FLOPs/Byte. 
Algorithms with arithmetic intensity below this are memory-bound.", "~295 FLOPs/Byte. Algorithms with arithmetic intensity above this are memory-bound."], "correct_index": 2}}, {"id": "cloud-0418", "title": "The 70B Inference Footprint", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the approximate memory needed just to load the model weights for inference in half-precision (FP16)?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 0}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 GB", "14 GB", "140 GB", "1120 GB"], "correct_index": 2}}, {"id": "cloud-0419", "title": "The RAG Update Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the memory implications of the blue-green re-index, and how much extra storage is needed for 1M FP16 4096-d embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.1 GB", "~16.4 GB", "~8.2 GB", "~0 GB"], "correct_index": 2}}, {"id": "cloud-0421", "title": "The First-Year Cost of an H100", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the first-year TCO for one H100 running 24/7, including CapEx, maintenance, and power at $0.15/kWh with 1.1 PUE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$30,000", "~$31,500", "~$32,420", "~$32,512"], "correct_index": 3}}, {"id": "cloud-0422", "title": "The Economics of Privacy: Federated vs. Centralized Training", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "At $0.05/GB and 100 FL rounds, what are the transfer costs for centralized training versus federated learning, and which is cheaper?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$25 (FL) vs. $500,000 (Centralized)", "$10,000 (FL) vs. $500,000 (Centralized)", "$500,000 (FL) vs. $500,000 (Centralized)", "$5,000 (FL) vs. 
$500,000 (Centralized)"], "correct_index": 3}}, {"id": "cloud-0423", "title": "The Data Pipeline Stall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely bottleneck in your node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0426", "title": "The Batch Size Sweet Spot", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does throughput increase 51x with a 64x batch size increase, and at what batch size do diminishing returns begin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput increases because memory bandwidth scales linearly with batch size. Diminishing returns start at batch 64.", "Throughput increases because the GPU caches the weights in SRAM. Diminishing returns start at batch 1024 when SRAM is full.", "Throughput increases because the workload shifts from compute-bound to memory-bound. Diminishing returns start at batch 128 due to KV-cache limits.", "Throughput increases by reusing weight reads from HBM, increasing arithmetic intensity. Diminishing returns start at the ridge point (~batch 295), but VRAM limits the max batch to ~128."], "correct_index": 3}}, {"id": "cloud-0427", "title": "The Small Batch Anomaly", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might a powerful CPU complete batch-1 inference for a small CNN faster than a high-end GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0428", "title": "The OOM Error", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the system OOM instantly on step 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0429", "title": "The GPU Utilization Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Where did 77% of your GPU-hours go, and what is the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0430", "title": "The Token Throughput Estimate", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "During autoregressive decoding at batch=1, roughly how many tokens/sec can a 70B LLM generate, and why is it memory-bandwidth bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0432", "title": "The Arithmetic Intensity Question", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is an H100 showing only 15% of peak TFLOPS not necessarily broken?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0434", "title": "The Mid-Afternoon Throttling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of the daily 15-20% throughput drop when GPU power is capped around 560W at 2 PM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server's Power Supply Unit (PSU) is failing and cannot provide the full 700W to the GPU.", "The training script has a software bug that reduces computational intensity after a few hours of running.", "A 'noisy neighbor' is running on the same server, stealing CPU cycles and starving the GPU of data.", "The datacenter's ambient temperature is rising in the afternoon, reducing the GPU's thermal headroom and forcing it to throttle power."], "correct_index": 3}}, {"id": "cloud-0435", "title": "The Illusion of Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the speedup so negligible?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 0}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is now memory-bandwidth bound because loading the sparse weight indices from HBM is slower than the compute savings.", "The pruning is unstructured, which prevents the H100's Tensor Cores from accelerating the matrix math and leads to inefficient, irregular memory access.", "Kernel launch overhead now dominates the execution time, as the pruned model still requires launching the same number of CUDA kernels.", "The model has been compressed on disk, but the GPU driver is decompressing it back to a dense format in memory, nullifying the pruning."], "correct_index": 1}}, {"id": "cloud-0436", "title": "The Distillation Cost-Benefit Analysis", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the cost of distilling the 175B teacher into a 7B student on 140B tokens at 40% MFU and $2/GPU-hour, and what drives it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around $7,700, primarily driven by the student model's backpropagation steps.", "Around $193,000, because the MFU is so low that it makes the hardware inefficient.", "Around $77,100, primarily driven by the compute required for the teacher model's forward passes.", "Around $30,000, primarily driven by the HBM memory capacity needed to hold both models."], "correct_index": 2}}, {"id": "cloud-0437", "title": "The Tale of Two 
Latencies", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which optimizations address the 8k-token prefill latency and the chatbot time-per-token latency for the 70B LLM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply FlashAttention to the chatbot to reduce per-token time, and Speculative Decoding to the summarizer to handle the long context.", "Apply FlashAttention to the summarizer to fix prefill latency, and Speculative Decoding to the chatbot to reduce per-token latency.", "Apply FlashAttention to both; it speeds up all attention calculations, which will fix both prefill and decoding latency.", "Neither. The issue is network latency for the chatbot and an insufficient batch size for the summarizer, not the model's architecture."], "correct_index": 1}}, {"id": "cloud-0438", "title": "The Tensor Parallelism Choke Point", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should the 4GB all-reduce take with all 8 GPUs on NVLink versus split across two servers, and what bottleneck does this reveal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bottleneck is PCIe Gen5 bandwidth, as the data must cross the PCIe bus taking ~220ms (calculating 7GB / 32GB/s).", "Software overhead in the NCCL communication library is the primary issue; the hardware difference between NVLink and InfiniBand is mathematically negligible.", "The bottleneck is the InfiniBand NDR link, which has ~18x less bandwidth (50 GB/s vs 900 GB/s). 
The inter-node transfer takes ~140ms vs ~7.8ms for the all-NVLink case.", "The bottleneck is HBM memory access latency; reading the 4 GB tensor from HBM on each GPU at 3.35TB/s is slower than the network transfer itself."], "correct_index": 2}}, {"id": "cloud-0439", "title": "The Silent Failure Cascade", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted time for the 4,096-GPU, 25-day 175B LLM training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Checkpoint every 5 minutes to be safe.", "Checkpoint roughly every 18-20 minutes.", "Checkpoint once per day to minimize overhead.", "The job failed after 3 days, so checkpointing every 48 hours is sufficient."], "correct_index": 1}}, {"id": "cloud-0441", "title": "The GPU-Bound Inference Stall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this poor performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too small to saturate the GPU's arithmetic units.", "The PCIe Gen5 bus is a bottleneck, preventing data from reaching the GPU quickly enough.", "The system is dispatch-bound due to high kernel launch overhead from numerous small operations.", "The model is memory-bandwidth bound because it needs to read large embedding tables from HBM."], "correct_index": 2}}, {"id": "cloud-0442", "title": "The Long-Context OOM Failure", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given that the model parameters (140GB in FP16, distributed via Tensor Parallelism) and optimizer states fit in memory, what is the most likely cause of this sudden OOM error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Adam optimizer states have doubled in size due to the longer sequence.", "The full (N, N) attention score matrix is being materialized in HBM, which scales quadratically.", "The KV-cache for the 8192-length context is too large to fit in memory.", "The gradient checkpointing buffer is overflowing with the larger activation sizes."], "correct_index": 1}}, {"id": "cloud-0444", "title": "The Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause for these diminishing returns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large, causing PCIe bus saturation as data is swapped to system RAM.", "The All-Reduce collective operation is saturating the inter-node InfiniBand network.", "The training workload is compute-bound, and we have hit maximum TFLOPS.", "NVLink bandwidth is insufficient for the amount 
of intra-node gradient sharing required."], "correct_index": 1}}, {"id": "cloud-0445", "title": "The Embedding Lookup Lag", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which bus protocol is the critical path and what is its approximate latency for a single lookup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand RDMA, with a latency of ~5,000 ns per lookup. (Calculated trap: Assumes inter-node routing)", "PCIe Gen5, with a latency of ~1,000 ns per lookup. (Calculated trap: Assumes non-NVLink traversal)", "NVLink, with a combined bus and remote HBM access latency of ~800 ns per lookup. (Correct)", "HBM3 memory bandwidth. (Calculated trap: Assumes bandwidth-bound instead of latency-bound)"], "correct_index": 2}}, {"id": "cloud-0446", "title": "The Silent GPU Killer", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely cause of this failure, and what should be your immediate action?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The training code has entered an infinite loop, causing the application to hang. You should attach a debugger to the Python process.", "The data loading pipeline is stuck, starving the GPU and freezing the process. You should investigate the data loader and network performance.", "A transient GPU hardware fault occurred, confirmed by the `dmesg` error. You should ensure the job automatically restarts from its last checkpoint on a different node.", "The periodic checkpointing process is hanging while writing to the file system, which freezes the training loop. You should investigate the storage system's health."], "correct_index": 2}}, {"id": "cloud-0447", "title": "The CISO vs. 
The CFO: Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using ALE, should the bank choose a $650k centralized 70B training run or federated learning if centralization raises $100M breach risk from 1% to 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The $650,000 cost per training run is the dominant factor, as frequent retraining could exceed millions per year.", "The engineering cost of building and maintaining a complex Federated Learning pipeline will be the highest cost.", "The $4M increase in Annualized Loss Expectancy from centralizing the data is the most significant financial factor.", "The network egress cost to transfer petabytes of data from partner banks to the central cloud will be the largest one-time expense."], "correct_index": 2}}, {"id": "cloud-0448", "title": "The Satellite Imagery Scaling Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 4096x4096 satellite imagery, should you choose a ConvNet or a standard ViT, and how large is the scaling gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT, as it is a more modern architecture that generally outperforms ConvNets, indicating superior feature learning capabilities.", "Both are comparable; since they have a similar number of parameters, their inference FLOPs and serving cost will be roughly the same.", "The ConvNet, because the ViT's attention mechanism scales quadratically with the number of patches, leading to an intractable explosion in compute at high resolutions.", "The ConvNet, because it will have higher arithmetic intensity and better saturate the GPU's memory bandwidth compared to the ViT."], "correct_index": 2}}, {"id": "cloud-0450", "title": "The PCIe Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most probable architectural bottleneck causing the GPU to starve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVMe SSD array cannot provide data fast enough to the CPU.", "The bandwidth of the PCIe Gen5 bus is insufficient to keep the GPU's memory fed with data.", "The number of CPU workers in the DataLoader is too low, causing a preprocessing bottleneck.", "The GPU's L2 cache is too small, causing frequent, slow misses to HBM."], "correct_index": 1}}, {"id": "cloud-0451", "title": "The Privacy-TCO Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for centralized versus federated training across 10 hospitals with 50TB each, and which should you recommend?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 2}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper because its one-time compute cost ($80k) is far less than the federated hardware cost ($800k).", "Federated is cheaper primarily because it avoids the massive recurring cost of the specialized compliance team and long-term cloud storage.", "Centralized is cheaper because the data egress cost for federated learning ($1,500) will grow to be the largest expense over time.", "They are roughly equivalent in cost; the higher hardware CapEx of the federated approach is offset by the higher compute cost of the centralized one."], "correct_index": 1}}, {"id": "cloud-0452", "title": "The Startup's Scaling Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 1T tokens and 1M H100-hours, should the startup train the 2B model or the 25B model under Chinchilla scaling?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 25B model, because its dense matrix multiplications will achieve higher MFU on H100s, making more efficient use of the grant.", "The 25B model, because it is closer to the data-optimal size for 1T tokens and easily fits within the compute budget.", "The 2B model, because the 25B model is too large for the 1T token dataset, making it data-constrained and leading to wasted compute.", "The 2B model, because smaller models require fewer FLOPs per parameter, allowing us to train for more epochs."], "correct_index": 1}}, {"id": "cloud-0453", "title": "The Single-Node Slowdown", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the 14GB inter-GPU transfer taking about 220ms on the 8-H100 DataParallel server?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is bottlenecked on data preprocessing, starving the GPUs.", "The InfiniBand network connection to other nodes is saturated.", "The server lacks a direct NVLink bridge, forcing GPU communication over the slower PCIe bus.", "The H100 HBM3 memory bandwidth is insufficient for the model size."], "correct_index": 2}}, {"id": "cloud-0454", "title": "The Scaling Efficiency Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does switching from 400Gbps InfiniBand to 100Gbps Ethernet balloon step time from 11.3s to over 27s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cluster's storage (NVMe) is too slow for writing checkpoints at each step.", "The PCIe bus on each node is saturated from transferring data to the network card.", "The Ethernet cluster lacks RDMA, forcing slow, CPU-mediated data transfers for gradient synchronization.", "The 4x reduction in raw bandwidth strictly forces a 4x increase in total step 
time."], "correct_index": 2}}, {"id": "cloud-0455", "title": "The HealthTech TCO Dilemma", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which smart-reply strategy has lower annual TCO after adding HIPAA breach ALE: centralized cloud training or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because the $288k compute cost is far less than the extra $500k in engineering salaries for FL.", "Federated, because data egress costs to upload petabytes of data from 100 hospitals would exceed the engineering overhead.", "Federated, because the Annual Loss Expectancy from a potential data breach in the centralized model makes it significantly more expensive.", "Centralized, because FL models converge slower and have lower accuracy, leading to hidden opportunity costs in product quality that outweigh the breach risk."], "correct_index": 2}}, {"id": "cloud-0456", "title": "The SLA-Driven Batching Strategy", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is large static batching a poor solution for this unified service, and what scheduling approach is better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a good solution; the chatbot users will just have to tolerate higher latency.", "It fails because large static batches can exhaust HBM, causing frequent swapping.", "It fails because high-latency chatbot requests will be starved by the throughput-focused batch jobs. A better solution is continuous batching with priority scheduling.", "It's better to build two separate physical clusters, one for each workload, to guarantee isolation."], "correct_index": 2}}, {"id": "cloud-0457", "title": "The Intra-Node Scaling Failure", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does scaling from 4 to 8 H100s in one server yield only a 1.3x speedup when all-reduce is slow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to storage is saturated.", "The GPUs are communicating over the PCIe bus instead of the NVLink fabric.", "The HBM3 memory on each GPU doesn't have enough bandwidth to handle the gradients.", "The ring all-reduce algorithm is inefficient and should be replaced with a tree all-reduce."], "correct_index": 1}}, {"id": "cloud-0458", "title": "The Mysterious Multi-Node Slowdown", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause for this drastic inter-node communication slowdown?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The intra-node NVLink bandwidth is insufficient to feed the InfiniBand NIC. 
(NVLink trap)", "RDMA has failed, forcing communication to use a slow, CPU-bound IP-over-InfiniBand fallback.", "The 400 Gbps InfiniBand switch lacks the capacity for this model size. (Raw BW trap)", "The PCIe bus connecting the InfiniBand NIC to the motherboard is saturated. (PCIe trap)"], "correct_index": 1}}, {"id": "cloud-0459", "title": "The Hospital TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Over three years, which option has lower TCO for quarterly medical-imaging retraining: centralized cloud training or federated on-prem training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Path A is cheaper because the hospital avoids the large $500,000 upfront server cost (CapEx).", "Path B is cheaper because the 3-year operational costs are significantly lower than the recurring cloud rental fees, easily justifying the initial CapEx.", "Path B is more expensive because the cost of electricity and maintenance for 10 servers over 3 years exceeds the cost of renting GPUs.", "The costs are roughly equivalent, so the decision should be based purely on data privacy concerns, not economics."], "correct_index": 1}}, {"id": "cloud-0460", "title": "The Real-Time Voice Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the hardware meet the 350ms TTFT target for a 200-token prompt on a 7B LLM, and what are the prefill and first-decode times?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the 7B model's prefill computation will take over 500ms.", "Yes, the total TTFT including overheads and GPU processing is well under 100ms.", "Maybe, it depends entirely on the batch size the server is currently processing.", "No, reading 14GB of model weights from HBM will violate the 350ms deadline."], "correct_index": 1}}, {"id": "cloud-0461", "title": "The Two-Node Scaling Disappointment", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely physical bottleneck causing the scaling efficiency to drop off between two 8xH100 nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus on each server is saturated from transferring data to the network card.", "The chosen ring-based All-Reduce algorithm is inefficient for a two-node setup.", "The physical bandwidth of the inter-node InfiniBand connection is ~18x lower than the intra-node NVLink fabric.", "There is not enough HBM3 memory on the GPUs to store the gradients before communication."], "correct_index": 2}}, {"id": "cloud-0462", "title": "The Federated TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using TCO including the 1% chance of a $50M breach, should the fintech choose centralized 
training or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated Learning, because the $50M potential fine is an unacceptable business risk that outweighs any calculated TCO difference.", "Federated Learning, because its TCO of $1,800,000 provides total insurance against a $50M fine. The engineering cost is a one-time capital expense and shouldn't be in the TCO calculation.", "Centralized Training, because its TCO of $810,000 is significantly lower than the Federated Learning TCO of $1,800,000.", "Centralized Training, as its TCO is lower. The primary cost driver for Federated Learning is the high energy consumption from on-device training across millions of phones."], "correct_index": 2}}, {"id": "cloud-0463", "title": "The Phantom PCIe Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of this `All-Reduce` bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to the storage cluster is saturated with checkpointing traffic.", "The server's CPU is too slow to orchestrate the high-frequency gradient exchange between the 8 GPUs.", "The communication library (NCCL) is misconfigured and is routing traffic over the PCIe bus instead of NVLink.", "The HBM3 memory on the GPUs is too slow to read the gradients before the `All-Reduce` operation can begin."], "correct_index": 2}}, {"id": "cloud-0465", "title": "The Privacy Premium", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for centralized versus federated training across 10 hospitals, and which strategy would you recommend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated is cheaper. The $72,000 annual compute cost for Centralized training over 3 years is more than the cost of maintaining the FL system.", "Centralized is far more expensive. The data transfer cost for 100TB of data is over $1M alone, making it prohibitive.", "Centralized TCO is ~$944K and Federated TCO is ~$1.42M. While Centralized is cheaper, the unquantifiable risk of a data breach makes Federated the only responsible business decision.", "The TCOs are comparable. Centralized is ~$1.4M, while Federated is ~$1.5M. 
We should choose Centralized to get better model performance."], "correct_index": 2}}, {"id": "cloud-0466", "title": "The A/B Test TCO Trap", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can you demonstrate which option has the lower Total Cost of Ownership (TCO) for this two-model training run?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0467", "title": "The Two-Node Scaling Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What hardware bottleneck explains the poor scaling when moving from 8 H100s on one node to 16 H100s across two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU compute is saturated; the model is too small for 16 GPUs.", "The PCIe Gen5 bus connecting the GPUs to the CPU is saturated by the increased data parallelism.", "The inter-node InfiniBand connection has become the primary bottleneck for the data-parallel all-reduce.", "The intra-node NVLink bandwidth is insufficient for the 8-way tensor parallel collectives."], "correct_index": 2}}, {"id": "cloud-0468", "title": "The Multi-Node Scaling Cliff: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the primary bottleneck limiting the 16-node training job's scaling efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring activations between the CPU and GPU memory.", "HBM3 memory bandwidth on the GPUs is insufficient to keep the Tensor Cores fed during computation.", "The inter-node InfiniBand fabric is saturated by the 140 GB gradient All-Reduce step.", "The CPU is bottlenecking the data loading pipeline, starving all GPUs of new batches."], "correct_index": 2}}, {"id": "cloud-0470", "title": "The Tensor Parallel Scaling Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck is causing the 4GB tensor all-reduce to dominate when scaling from 8 to 16 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus connecting the GPUs to the motherboard is saturated by the increased traffic.", "The all-reduce algorithm in the framework's code is inefficient and doesn't scale beyond 8 GPUs.", "The inter-node InfiniBand network has much lower bandwidth than the intra-node NVLink fabric, creating a communication bottleneck.", "The GPUs are compute-bound and cannot keep up with the data from 16 parallel processes."], "correct_index": 2}}, {"id": "cloud-0471", "title": "The Tensor Parallelism Traffic Jam", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely diagnosis for this performance issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting to storage is saturated, delaying gradient accumulation.", "The GPUs are communicating over the PCIe bus instead of NVLink due to a system misconfiguration.", "The model's activations are too large, causing slow data movement from HBM to the streaming multiprocessors.", "The NCCL All-Reduce algorithm is misconfigured, using an inefficient Ring protocol instead of a Tree."], "correct_index": 1}}, {"id": "cloud-0475", "title": "The Inter-Node Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary communication bottleneck explains the 55% scaling efficiency and saturated inter-server links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring gradients between the CPU and the GPUs.", "The NVLink switch within each server is overloaded by the 8-GPU all-reduce traffic.", "The InfiniBand network bandwidth between servers is insufficient for inter-node gradient synchronization.", "The server's CPU is unable to schedule the RDMA (Remote Direct Memory Access) operations fast enough."], "correct_index": 2}}, {"id": "cloud-0479", "title": "The Cross-Rack Embedding Fetch", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum latency for GPU_A to RDMA-read GPU_B’s remote embedding, and which component dominates it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The HBM3 memory access on GPU_B (~300 ns), because accessing off-chip memory is fundamentally slow.", "The request is processed by the CPU on both servers, adding significant software overhead.", "The round trip over the InfiniBand network (~8,000 ns), as it involves two cross-rack transfers.", "The speed of light delay across the datacenter floor, which is on the order of milliseconds."], "correct_index": 2}}, {"id": "cloud-0480", "title": "The Two-Node Scaling Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Diagnose the most likely cause of this poor scaling when the 70B LLM's gradient All-Reduce crosses the 400 Gbps InfiniBand link.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVLink switch within each server is saturated by the 8-GPU All-Reduce traffic, creating a bottleneck.", "The PCIe Gen5 bus connecting the GPUs to the CPU is unable to handle the gradient data from 8 GPUs simultaneously.", "The 400 Gbps InfiniBand link between the two servers has insufficient bandwidth for the 280 GB cross-node gradient synchronization.", "TCP/IP protocol overhead on the InfiniBand 
network is adding too much latency to the All-Reduce operation."], "correct_index": 2}}, {"id": "cloud-0481", "title": "The Privacy vs. Churn Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which Smart Reply approach—centralized or federated—should you choose for 10 million users, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Option A (Centralized), because the $350k compute cost is much lower than the $900k engineering headcount for Option B.", "Option B (Federated), because the $1M data breach risk of the centralized model is an unacceptable business liability.", "Option A (Centralized), because the annual cost from user churn in the federated model (~$3M) is the single largest expense and far outweighs its privacy benefits.", "Option B (Federated), because it avoids centralizing PII, and the $5 LTV is too small to worry about a minor churn increase."], "correct_index": 2}}, {"id": "cloud-0482", "title": "The Multi-Node Scaling Ceiling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause limiting the scaling performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus on each node is saturated transferring data between the host CPU memory and the GPU.", "The training is compute-bound; the H100 GPUs are simply not powerful enough to handle a 175B model efficiently.", "The gradient all-reduce step is saturating the InfiniBand interconnect between nodes.", "There is a software deadlock in the NCCL communication library when using more than 8 nodes."], "correct_index": 2}}, {"id": "cloud-0486", "title": "The Two-Node Scaling Cliff: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the two servers are connected via a 400 Gbps InfiniBand NDR link, what is the most likely bottleneck causing this scaling cliff?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 2}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated transferring data between the host CPU memory and the GPUs.", "The InfiniBand network connecting the two servers has become the primary bottleneck.", "The NVLink fabric within each server is overloaded by the 16-GPU All-Reduce traffic.", "The NCCL All-Reduce algorithm is implemented inefficiently and is not optimized for 16-GPU configurations."], "correct_index": 1}}, {"id": "cloud-0488", "title": "The Cross-Node All-Reduce Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary communication bottleneck causes the 1.5x speedup when scaling the 175B model 
from one NVLink node to two InfiniBand nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from loading data for the next step.", "The NVLink 4.0 bandwidth within each node is insufficient for 8 H100s during the All-Reduce.", "The InfiniBand NDR network connecting the two nodes is the bottleneck.", "The model is simply too large, and the total gradient size exceeds what modern interconnects can handle efficiently."], "correct_index": 2}}, {"id": "cloud-0490", "title": "The Disappointing Scaling Factor", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 70B data-parallel job get only 1.2x speedup when All-Reduce crosses two InfiniBand-connected H100 servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus in each server is saturated from copying gradients between the GPUs and the CPU's main memory.", "The InfiniBand network connecting the two servers has insufficient bandwidth for the 140 GB gradient synchronization required at each step.", "The NVLink switch within each 8-GPU server is the bottleneck, as it cannot handle the All-Reduce traffic from 8 H100s simultaneously.", "The model is too large. Using a smaller 7B parameter model would resolve the communication bottleneck without changing hardware."], "correct_index": 1}}, {"id": "cloud-0491", "title": "The Hospital AI Rollout: Centralized vs. Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Over 3 years, which strategy is more economical for 500 hospitals refreshing 10 TB each annually, centralized or federated training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper. The 3-year TCO is $1.5M ($500k/yr * 3) vs. Federated's $1.45M, because data costs are negligible.", "Federated is cheaper. The 3-year TCO is ~$1.45M compared to the Centralized TCO of >$4.0M.", "Centralized is cheaper. The 3-year TCO is $1.6M ($500k compute + $100k ingress) vs. 
Federated's $1.45M, ignoring storage costs.", "Federated is cheaper, but only because it avoids the $3.6M ingress fee (10 TB * 500 * 3 years * $0.02/GB)."], "correct_index": 1}}, {"id": "cloud-0492", "title": "The Distributed Training Stall", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck explains the 4-node AllReduce slowdown for the 175B FP16 job on 400 Gbps InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated transferring data between the host CPU memory and the GPUs.", "The on-node NVLink 4.0 interconnect is the bottleneck, as 8-way tensor parallelism is too communication-intensive.", "The 400 Gbps InfiniBand interconnect is saturated by the 350 GB gradient synchronization during the AllReduce operation.", "The training is CPU-bound, as the CPUs on each node cannot schedule the NCCL kernels for the AllReduce operation fast enough."], "correct_index": 2}}, {"id": "cloud-0494", "title": "The Tensor Parallel Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary bottleneck explains the 420ms step time and 90% all-reduce overhead when scaling tensor parallelism to two nodes?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 1}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The `all-reduce` operation hits the InfiniBand bandwidth limit (50 GB/s), adding ~3.6 seconds of network wait per step.", "The `all-reduce` operation hits the PCIe Gen5 bus limit (64 GB/s), adding ~2.8 seconds of network wait per step.", "The `all-reduce` operation hits the InfiniBand bandwidth limit (400 GB/s), adding ~0.45 seconds of network wait per step.", "The `all-reduce` operation hits the NVLink bandwidth limit (900 GB/s), adding ~0.2 seconds of network wait per step."], "correct_index": 0}}, {"id": "cloud-0497", "title": "The Privacy vs. Profit A/B Test: Differential Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you decide between the A/B-tested cloud and federated keyboard models when accounting for cost, privacy risk, and churn?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Ship the Cloud model. The annual cost is only $876k, which is cheaper than the $1M lost to churn from the Federated model, and the 15% engagement lift is a huge business win.", "Ship the Federated model. It has a TCO of $0 since it runs on user devices, making it infinitely cheaper than the Cloud model which costs $876k per year.", "Reject the Cloud model on privacy grounds despite its lower TCO ($876k vs $1M). The risk of a data scandal is too high. The team must optimize the Federated model's power usage before launch.", "Reject both. 
The Cloud model's TCO is over $8M (25 GPUs * $30,000 CapEx * 10 for infrastructure) and the Federated model loses $1M. Neither is financially viable."], "correct_index": 2}}, {"id": "cloud-0505", "title": "The Profiling Crisis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is your tech lead wrong to suggest buying a faster GPU when achieving 120 TFLOPS out of 989 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0506", "title": "The Silent Padding Tax", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is destroying your performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0509", "title": "The Profiler Trace Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the overall GPU compute utilization, and where should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0511", "title": "The FSDP vs DDP Memory Trade-off", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which engineer is correct regarding the use of DDP versus FSDP for a 7B model, and what are the exact memory numbers?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 1}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0514", "title": "The Distributed Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the data loading bottleneck when scaling to 256 GPUs on a shared NFS server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0515", "title": "The Backpressure Cascade", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is happening, and how do you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0516", "title": "The 100 TB Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the 100 TB preprocessing pipeline, and what end-to-end time should you expect?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0517", "title": "The Data Pipeline Determinism Trap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What standard PyTorch `DataLoader` argument completely destroys your random seed determinism, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0526", "title": "The Preemption Penalty", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is dynamic preemption via PCIe KV-cache swapping feasible for a 13B model with a 100ms P99 SLO, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0531", "title": "The Sequence Parallelism Necessity", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is tensor parallelism alone insufficient, and what additional technique do you need?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0534", "title": "The Megatron-LM Tensor Parallelism", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is $T=16$ likely slower than $T=8$, and what is the exact communication cost per transformer layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0535", "title": "The TCP/IP CPU Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the CPU doing that is bottlenecking a 100 Gbps network during the AllReduce phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0536", "title": "The GPU Scheduling Dilemma", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you schedule these to maximize cluster utilization without starving any team?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0537", "title": "The Data Quality Pipeline", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a data quality pipeline that catches silent corruptions in the 10 TB/day training feed within 1 hour?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0538", "title": "The Roofline Across Precisions", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the ridge point for each precision and explain why a workload that is compute-bound at FP16 can become memory-bound at INT8.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0539", "title": "The FlashAttention Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can you get faster without doing less math?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0540", "title": "The NUMA Node Cross-Talk", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What physical motherboard bottleneck are you hitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0544", "title": "The Multi-Modal Prefill Stall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the high-resolution image processing so slow, and what fundamental compute trade-off is the PM missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0545", "title": "The Multi-Tenant Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is adding GPUs the wrong fix for Premium timeouts behind Standard jobs, and what architecture would prevent this priority inversion?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0546", "title": "The Speculative Decoding Trap", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does this optimization, designed to make things faster, cause a catastrophic throughput collapse under load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0547", "title": "The Speculative Backfire", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What was wrong with the speculative decoding rollout, and why does multi-turn code generation see worse time-per-token?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0548", "title": "The Power Efficiency Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why might deploying this 20% more power-efficient kernel be a catastrophic decision for your H100 fleet's throughput-oriented workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0550", "title": "The Speculative Decoding Backfire: Speculative Decoding", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an optimization that helps the average case cause a non-linear latency explosion for the worst case?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0553", "title": "The Roofline Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How is that physically possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0554", "title": "The Amdahl Ceiling", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Where did the other 480x of our hardware investment go?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0555", "title": "The Context Parallelism for Long Sequences", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you distribute the sequence across GPUs, and what's the communication pattern?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0557", "title": "The InfiniBand Adaptive Routing Loop", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What did Adaptive Routing do to the packets to break NCCL?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 3}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0558", "title": "The Quantization Bias Amplifier", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How did quantization amplify a bias that barely existed in the original model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0559", "title": "The Privacy Throughput Cliff", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does adding privacy guarantees have such a devastating systems cost, and how do you bring it down to something feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0560", "title": "The New Hotness vs. The Incumbent", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you invest in a new cluster of Prometheus-1 chips given their 5x peak compute, 2x memory bandwidth, and 3x price premium over H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0576", "title": "The Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What platform architecture makes INT8 quantization robust to distribution shift instead of relying on one-off calibration fixes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0577", "title": "The Silent NaN Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What three-tiered platform design would make one-click INT8 quantization safe for the 70B financial LLM and future models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0578", "title": "The Blackwell Bet: A $100M GPU Upgrade Decision", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you evaluate the $100M fleet upgrade beyond headline TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0580", "title": "The Billion-Frame Quantization Strategy", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural proposals, what data do you need to collect to de-risk them, and how would you prove safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0581", "title": "The Blackwell Bet: A Datacenter Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you evaluate whether B200s or more H100 capacity can actually halve the 200B world-model training time?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0582", "title": "The Quantization Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What strategy would you propose to get 2x throughput from INT8 while managing outlier activations and production distribution shift?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 3}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0583", "title": "The Hypersensitive Hyperscaler", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you formulate a quantization strategy and hardware configuration that guarantees zero accuracy loss while minimizing cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0585", "title": "The Collapsed Dynamic Range", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural proposals, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0586", "title": "The Blackwell Bet: Justifying a Datacenter Upgrade", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What data-driven framework would you use to decide whether to replace the 1,000 H100s with B200s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0587", "title": "The Silent Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What three-step plan would you propose to diagnose, mitigate, and solve the INT8 failure for power users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0588", "title": "The AI Startup's Cost of Goods Sold Crisis", "topic": "activation-memory", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What phased plan would you use to cut the 70B coding assistant's inference cost per user by 50% without noticeable quality loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0589", "title": "The Mixture-of-Experts Quantization Collapse", "topic": "quantization-fundamentals", "competency_area": 
"precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design a quantization strategy that works, and what specific architectural failure explains the initial collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0590", "title": "The Catastrophic Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the most likely root cause of the INT8 collapse, how would you prove it experimentally, and what quantization strategy would you deploy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0591", "title": "The B200 Fleet Upgrade Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Would you recommend a full B200 upgrade for the fleet, or a heterogeneous fleet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0593", "title": "The Silent Overflow Catastrophe: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What safety-first quantization pipeline would you build after the INT8 fraud LLM passes offline tests but fails on production outliers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0595", "title": "The CPU-GPU Data Transfer Tax", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What happened during that first request, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0596", "title": "The Prefetch Buffer Sizing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might setting num_workers=32 make things worse, and what's the right way to size the prefetch pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0597", "title": "The Adam Memory Multiplier", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does the optimizer alone consume, and what fraction of VRAM does it take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0598", "title": "The Decode Bandwidth Demand", "topic": 
"memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 13B FP16 model at 8k context on an A100, how many bytes are read per decode token and what tokens/sec does that imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0600", "title": "The Stuttering Training Loop", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is a common, non-obvious reason for this low utilization and stuttering behavior?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0601", "title": "The Beam Search Memory Explosion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does Beam Search destroy your concurrency scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0603", "title": "The PCIe Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 14 GB transfer over PCIe Gen4 x16 take, and will the cold start meet the 5-second SLA?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 0}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0607", "title": "The HBM3e Bandwidth Ceiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which workloads actually benefit from the extra bandwidth, and which don't?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0608", "title": "The NUMA Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical topology problem is causing the 40% drop in data loading throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0609", "title": "The Activation Recomputation Trade-off", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the overhead higher than expected?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0611", "title": "The Cache Line Waste", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What fraction of each cache line fetch is wasted, and how does this compare to row-major iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0612", "title": "The Gradient Checkpoint Trade-off", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 32-layer transformer, how much memory does gradient checkpointing every k layers save, and what compute overhead does it add?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 2}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0613", "title": "The Striding Stumble", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the matrix[i][j] inner-loop version 3–5x faster than matrix[j][i] despite doing the same arithmetic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0614", "title": "The Prompt Caching Optimization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Quantify the compute and memory savings of caching the system prompt, and justify if it is worth the engineering effort.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0616", "title": "The AMD MI300X Memory Advantage", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this true, and what's the real systems impact of going from 2 GPUs to 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0617", "title": "The Leaking Inference Server", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is leaking and how do you find it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0618", "title": "The Energy-Movement Invariant", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did pruning 50% of weights halve MACs but barely reduce node energy consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0619", "title": "The Strided Memory Fetch", "topic": "memory-hierarchy-design", 
"competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does Version B run 30x slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0620", "title": "The HBM vs SRAM Bandwidth Gap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did a seemingly small change in where a tensor is stored yield a 5x speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0621", "title": "The CXL Memory Tier", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can CXL memory avoid tensor parallelism for the 140 GB FP16 70B model, and what performance impact would it have?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 2}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0622", "title": "The NUMA Nightmare", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What technical architectural detail of modern multi-socket systems is most likely causing this, and how would you diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0623", "title": "The Memory Dilemma", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a long-context Transformer accelerator, would you choose HBM or GDDR6, and what trade-offs justify that choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0624", "title": "The GPU Memory Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "We have 80% free memory — where did it go?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0625", "title": "The KV-Cache Fragmentation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where did the missing 19GB of VRAM go?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0626", "title": "The PagedAttention Block Size Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": 
"cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did a large block size destroy your capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0628", "title": "The Embedding Table Sharding Problem", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are the 64 GPUs 80% idle when training a 1 TB sharded embedding table if the network is not saturated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0629", "title": "The Embedding Hotspot", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happened on GPU 14?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0630", "title": "The Bandwidth Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What shared resource are they fighting over?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0631", "title": "The Disaggregated Memory Architecture", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For which ML workloads does CXL memory make sense, and for which is it a trap?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 3}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0632", "title": "The Paging Paradox", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the underlying cause of this performance bottleneck, and what specific operating system feature would you leverage to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0633", "title": "The Phantom Update", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the most likely underlying hardware-level issue causing the degradation, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0634", "title": "The Fragmentation Crisis", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", 
"status": "published", "phase": "both", "question": "What is consuming our VRAM invisibly, and how do we fix it?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 4}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0635", "title": "The Gradient Checkpointing Boundary", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "With ZeRO-3 on 8x 80GB GPUs, what is the actual per-GPU memory budget for training a 70B model, and where does it break?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 4}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0636", "title": "The INT8 Throughput Advantage", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is INT8 inference always 2× faster than FP16 because the data is half the size, and when is that wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0637", "title": "The GGUF Quantization Ladder", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For Llama-3 70B on a 24 GB RTX 4090, how do Q4_K_M, Q5_K_M, and Q8_0 trade off size, throughput, quality, and fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0638", "title": "The FP16 vs BF16 Question", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do people use BF16 instead of FP16 for training if both formats are 16 bits?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 0}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0639", "title": "The Underflow Crisis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What numerical property is failing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0640", "title": "The Half-Baked Speedup", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's a likely technical explanation for this discrepancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0641", "title": "The Quantized Serving Accuracy Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Walk me through the real trade-offs — when does the accuracy drop actually matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0642", "title": "The Latency Budget Breach", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you approach optimizing it for this strict latency constraint without a complete re-architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0643", "title": "The Quantization Error Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For INT4 group-wise quantization with group size 128, what is the worst-case error per group and when does it become catastrophic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0644", "title": "The Mixed-Precision Training Instability", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's happening, and how do we fix it without giving up FP8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0645", "title": "The CUDA Upgrade Regression", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can the same weights produce different accuracy with a different CUDA version?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0646", "title": "The FP8 Underflow Crash", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What numerical physics destroyed your training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0647", "title": "The FP8 Training Frontier", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does FP8 work for the large model but fail for the small one?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0648", "title": "The Quantization Noise Floor", "topic": "quantization-fundamentals", 
"competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the SQNR for INT8 quantization of Gaussian weights with \\sigma=0.02, and when do additional quantization levels stop helping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0649", "title": "The Model Compression Pipeline", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compress the 70B FP16 model to hit $0.40 per 1M tokens on one GPU while keeping quality loss under 2%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0650", "title": "The FP16 Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is happening at step 50k that wasn't happening at step 1k?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 2}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0651", "title": "The Precision Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Assuming we are compute-bound, what hardware architectural detail did we forget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0652", "title": "The Fine-Tuning Estimate", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you estimate the total cost and GPU-hours required to fine-tune Llama-2-13B on a 1M-example dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0653", "title": "The Inference Batch Size Sweet Spot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use roofline analysis to find the batch-size sweet spot for the 7B LLM on one H100 under the <500ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0654", "title": "The NCCL NVLink Deadlock", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 7-GPU topology fail and hang on a DGX machine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-0655", "title": "The NVLink PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why will Vendor A's server fail catastrophically at 8-GPU Data Parallel training compared to Vendor B?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0657", "title": "The Tokenizer Overhead Spikes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What CPU-bound process is blocking the GPU from doing its job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0658", "title": "The FP32 Fallback Penalty", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is your FP16 model running at FP32 speeds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0659", "title": "The GQA/MQA Memory Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why didn't an 8x reduction in KV-cache size yield an 8x reduction in latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0660", "title": "The MoE Memory Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 47B MoE with only 2 active experts per token OOM on a 40 GB card when a dense 13B model would easily fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0661", "title": "The PCIe ACS Block", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the CPU getting involved in a direct GPU-to-GPU transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0662", "title": "The Multi-LoRA Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you serve 100 LoRA adapters for a shared Llama-70B base model, including memory, swapping, and batching across adapters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0663", "title": "The Vision-Language Model Serving", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where are the 7.8 extra seconds going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0664", "title": "The CPU-Bound Generation Loop", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What part of the Python generate loop is physically preventing the GPU from running faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0665", "title": "The Attention Cost Explosion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does 32\\times more context cost 10\\times more to serve, and what are our architectural options?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0666", "title": "The TPU v5e vs H100 Trade-off", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why isn't choosing TPU v5e over H100 for serving a 7B model just a $/chip-hour comparison?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0667", "title": "The Sparsity Fallacy", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does doing less work take more time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0669", "title": "The Disaggregated Serving Architecture", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do we structurally isolate compute-bound long prompts from memory-bound token generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0670", "title": "The Decoding Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can we generate tokens faster without changing the model weights, quantizing, or losing exact mathematical accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0671", "title": "The Multi-Modal Token Starvation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", 
"level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is the manager focusing on the wrong part of the stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0672", "title": "The Router Bottleneck in MoE Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architectural component of the MoE model is destroying your memory bandwidth?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 3}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0673", "title": "The PCIe Switch Starvation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Where is the specific hardware bottleneck starving the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0674", "title": "The Prefill-Decode Disaggregation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why would dedicating GPUs to different phases actually improve both throughput and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0675", "title": "The Compilation Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What happened inside the framework to cause this latency drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0676", "title": "The Inference Compiler Optimization", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the real hardware-level reason fusion is so effective?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 2}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0677", "title": "The Gaudi 3 Compiler Bet", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the real systems trade-offs between hand-written CUDA kernels and Gaudi's graph compiler?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0678", "title": "The Automated Model Optimization 
Pipeline", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an automated model optimization pipeline that takes <1 hour and keeps quality regression under 3% on 95% of models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0679", "title": "The Stalled Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What's the likely culprit for the data_loader bottleneck, and how would you diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0680", "title": "The Gradient Accumulation Equivalence", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "In what two cases does gradient accumulation with batch 64x16 stop being mathematically identical to true batch 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0681", "title": "The Training Time Estimate", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long will one epoch take, and how does the data pipeline bottleneck impact cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0682", "title": "The Distributed Training Data Bottleneck", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the most likely bottleneck, and how would you systematically diagnose and resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0683", "title": "The ZeRO-1 Memory Squeeze", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why didn't ZeRO-1 save you enough memory?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 2}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0684", "title": "What is DDP?", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does DDP stand for and what is its primary function?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Data 
Driven Processing; it automatically cleans the dataset.", "Distributed Data Parallel; it copies the model to all GPUs and splits the data batch.", "Dynamic Device Partitioning; it splits the layers of a single model across GPUs.", "Distributed Data Parallel; it shards the model weights across GPUs to save memory."], "correct_index": 1}}, {"id": "cloud-0685", "title": "The All-Reduce Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If one GPU is slightly slower (a straggler), what happens to the rest of the cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The fast GPUs proceed and calculate asynchronous gradients.", "The PyTorch dispatcher automatically re-assigns the batch to a faster node.", "The entire cluster stalls at the synchronization barrier, wasting compute time.", "The cluster drops the straggler's gradients to maintain high throughput."], "correct_index": 2}}, {"id": "cloud-0687", "title": "The Optimizer Explosion", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does Adam mixed-precision training of a 30B model OOM, and which sharding strategy fixes it?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 0}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0688", "title": "The Communication Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the network diagram, why has our scaling efficiency collapsed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0690", "title": "The Memory Copy Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Based on the architecture and symptoms, what is the 'silent killer' limiting your throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0691", "title": "The Resource Contention Stutter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the diagram, why can't your GPUs handle both tasks simultaneously?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0692", "title": "The Memory Swiss Cheese", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the memory map, 
why is your VRAM already exhausted?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0693", "title": "The Checkpoint Traffic Jam", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the storage topology, what two physical bottlenecks are you hitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0695", "title": "The KV-Cache Network Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Based on the diagram, what physical link is destroying your latency gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0698", "title": "The Pipeline Bubble Tax", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "With P=4, M=16, and T_stage=50 ms, how do you calculate the total global-batch time, and what is it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["800 ms", "3200 ms", "1000 ms", "950 ms"], "correct_index": 3}}, {"id": "cloud-0699", "title": "The AllReduce Bottleneck", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which communication primitive is the most common scaling bottleneck when moving DDP from 8 to 128 GPUs?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 0}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data loading (ETL) pipeline", "The optimizer step (e.g., AdamW)", "The AllReduce operation", "The forward pass computation"], "correct_index": 2}}, {"id": "cloud-0700", "title": "The FSDP Memory Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Using FSDP/ZeRO-3 on 16 H100s, how much memory per GPU is needed for parameters, gradients, and Adam states for a 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1120 GB", "8.75 GB", "70 GB", "140 GB"], "correct_index": 2}}, {"id": "cloud-0701", "title": "The Training Cost Estimate", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the estimated H100 training cost for a 70B model on 2T tokens at $3.50/GPU-hour, and what is the biggest budget risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-0702", "title": "The Phantom Performance Drop", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you quickly pinpoint if the input data for the production model is different from the training data, and what's the most common culprit?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 1}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0704", "title": "The Distributed Training Choke Point", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What primary bottleneck causes the 50% per-GPU throughput drop when scaling the 70B job from one 8x GPU node to two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 140 GB gradient transfer over PCIe Gen5 adds 4.4 seconds of latency per step, saturating the bus.", "The 16-GPU ring AllReduce transfers 262.5 GB over the 50 GB/s InfiniBand, increasing step time to 5.25s.", "The TCP/IP encapsulation overhead adds 2.5s of protocol serialization delay, halving throughput.", "Model parameters expand to 280 GB in distributed FP32, exceeding single-node NVLink limits."], "correct_index": 1}}, {"id": "cloud-0705", "title": "The Tensor Parallelism Scaling Trap: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 1.2× scaling expected or anomalous, and what is the most likely cause of the poor scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data transfer from host CPU memory to GPU memory over PCIe is the bottleneck.", "The RDMA protocol is adding excessive latency overhead, which accounts for the slowdown.", "The performance is expected; the bottleneck is the ~18x bandwidth gap between intra-node NVLink (900 GB/s) and inter-node InfiniBand (50 GB/s).", "One of the nodes must have a faulty NVLink switch that is slowing down the entire 16-GPU communication ring."], "correct_index": 2}}, {"id": "cloud-0707", "title": "The AllReduce Scaling Trap", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What communication bottleneck causes poor scaling when the 70B FP16 job moves from one 8×H100 node to two nodes over InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated because gradients must be copied to CPU RAM taking ~4.3s before being sent to the network.", "The NVLink 4.0 interconnect within each node is the bottleneck; it cannot handle the 140GB gradient exchange.", "The inter-node InfiniBand NDR network fabric is the bottleneck, as its bandwidth is 
much lower than the intra-node NVLink fabric.", "The CPUs on each node are overwhelmed with coordinating the RDMA transfers, starving the GPUs of instructions."], "correct_index": 2}}, {"id": "cloud-0709", "title": "The Data Parallelism Scaling Efficiency", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What explains the sub-linear scaling, and at what GPU count does adding more GPUs actually hurt throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0710", "title": "The Tensor Parallelism Degree", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which TP degree—2, 4, or 8—is optimal for the 70B LLM on H100 NVLink under a 40 ms per-token latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0711", "title": "The ZeRO-3 Communication Overhead", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the ZeRO-3 throughput 40% slower, and where exactly is the GPU time going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0713", "title": "The Pipeline Bubble Cost", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much GPU-time is wasted in the pipeline bubble, and how many microbatches do you actually need to keep the bubble under 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0714", "title": "The Heterogeneous GPU Training", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What happens when you combine A100 and H100 GPUs in a single DDP training job, and how can you use both efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0715", "title": "The Async SGD Staleness Problem", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is that the full story, and can you quantify the staleness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0716", "title": "The Pipeline Stutter (1F1B)", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What non-compute operation broke the 1F1B rhythm?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0717", "title": "The Idempotent Training Pipeline", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign this pipeline to be more fault-tolerant and cost-efficient, specifically focusing on making its stages idempotent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0718", "title": "The Straggler Log Rotation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the cron job doing that halts the entire cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0719", "title": "The Expert Parallelism Communication", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "At what point does the network become the bottleneck in this MoE training setup, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0720", "title": "Dimensioning the 3D Cube", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you allocate the dimensions for Data (D), Tensor (T), and Pipeline (P) parallelism for a 175B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0721", "title": "The 3D Parallelism Orchestration", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you assign Tensor (T), Pipeline (P), and Data (D) parallelism across 1,024 H100s for a 175B, 96-layer model, and what physical constraint justifies each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0722", "title": "The ZeRO-3 Cross-Node Thrashing", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why did a slight batch size increase destroy the network?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 3}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0724", "title": "The Heterogeneous Cluster Scheduler", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": 
"published", "phase": "both", "question": "How would you design a heterogeneity-aware scheduler for a mixed-generation cluster to maximize utilization and cost efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0726", "title": "The Collective Communication Primitives", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For each collective, name a training strategy that uses it and estimate the per-GPU communication volume for a 1 GB tensor on 8 GPUs.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0727", "title": "The All-Reduce Stalemate", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can communication dominate for both a 1M and a 100B model on 8-node data-parallel training, but for different reasons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0728", "title": "The AllReduce Tax", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How long does the AllReduce take, and what fraction of the training step is spent on communication?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 2}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0729", "title": "The Cross-Rack Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What did we misunderstand about network topology and Tensor Parallelism?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 2}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0730", "title": "The InfiniBand vs RoCE Decision", "topic": "compound-ai-systems", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which networking option do you choose, and when does the cheaper option break down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0731", "title": "The Topology Trap", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which common network topology might be causing this, and which would you prefer for pipeline parallelism, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0732", "title": "NCCL's Uneven Footing", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and mitigate this inconsistent Data Parallelism performance, focusing on NCCL's behavior?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0733", "title": "The AllReduce Incast Congestion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical network phenomenon is causing this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0734", "title": "The NCCL Topology Misconfiguration", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is NCCL ignoring your expensive InfiniBand network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0735", "title": "The ToR Switch Buffer Microburst", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the network freezing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0736", "title": "The Ring AllReduce Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ring AllReduce degrade massively at scale despite constant per-GPU bandwidth, and what replaces it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0737", "title": "The NVLink Domain Boundary", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What physical boundary did you cross?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 3}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0738", "title": "The Oversubscription Choke", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0739", "title": "The Congestion Collapse", "topic": "interconnect-topology", "competency_area": "networking", "track": 
"cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What network physical phenomenon occurred?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0740", "title": "The Gradient Synchronization Overlap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Under what conditions is this true, and when does the overlap break down?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 3}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0741", "title": "The Network Congestion Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is happening to cause this throughput drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0742", "title": "The Congested Highway", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "You suspect network congestion, but how do you verify this and implement a system-level solution to ensure more predictable network performance for critical ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0743", "title": "The InfiniBand Subnet Saturation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural constraint in the Fat-Tree topology is causing the network efficiency to plummet for cross-pod jobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0744", "title": "The Gradient Compression Paradox", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why won't they get anywhere near 100× improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0745", "title": "The InfiniBand Link Flap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a single flapping InfiniBand link stall all 256 GPUs in the cluster?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0746", "title": "The Bisection Bandwidth Requirement", "topic": "collective-communication", "competency_area": 
"networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What minimum bisection bandwidth is needed to avoid communication bottlenecks, and is a fat-tree topology sufficient or optimal for this 1,024-GPU 3D-parallel workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0747", "title": "The Ring vs Tree Dilemma", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does the 'best' algorithm fail here?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0748", "title": "The Cross-Datacenter Training", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Is synchronous training feasible across this WAN link, and if not, what is your alternative architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0749", "title": "The Network Topology Tax", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should we trust this recommendation, or is the fat-tree worth the premium?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0750", "title": "The Spot Instance Checkpoint Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How often should you checkpoint to minimize wasted time, and what's the expected cost savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0751", "title": "The Straggler Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the total step time for the cluster, and what percentage of cluster compute is wasted due to this single node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0752", "title": "The Elastic Training Scaling", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you handle GPUs appearing and disappearing mid-training without restarting from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0753", "title": "The Checkpoint Storage Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": 
"cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What's the checkpoint size, how long does the write take, and what's the impact on training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0754", "title": "The Checkpoint Resurrection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the PM's 30-minute recovery estimate dangerously optimistic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0755", "title": "The Straggler Mitigation Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What fraction of steps will have at least one straggler, and how do you mitigate this without switching to async SGD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0756", "title": "The Failure Recovery Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the expected failures per day and effective utilization, and what architecture gets the 10,000-GPU job above 90% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0757", "title": "The Unstable Cluster", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What fault-tolerance strategy would you design to minimize downtime for this 1T-parameter training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0758", "title": "The Checkpoint Serialization Freeze", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why won't faster S3 fix the 3-minute stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0759", "title": "The NFS Checkpoint Corruption", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What happened to the step-79,000 checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0760", "title": "The Silent ECC Degradation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", 
"level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What hardware component is silently degrading, and when does this 'slightly different' behavior become dangerously wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0761", "title": "The Optimal Checkpoint Interval", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted time (checkpoint overhead plus expected lost work from failures)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0762", "title": "The MTBF Crisis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Is saving a checkpoint for 5 minutes every hour viable at 10k GPU scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0763", "title": "The Fault-Tolerant Training Framework", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What fault-tolerant training framework would achieve >95% effective utilization with 2,048 GPUs and a cluster MTBF under 30 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0764", "title": "The Silent Data Corruption at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What could cause a model to silently underperform, and how would you detect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0765", "title": "The Cosmic Ray Divergence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How can a cosmic ray bit flip corrupt a model without crashing it, and how do you find which of the 70 billion parameters is wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0766", "title": "The Split-Brain Checkpoint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you recover without losing more than 1,000 steps of work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0767", "title": "The Warmup Learning Rate Schedule", "topic": "3d-parallelism", 
"competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the systems-level reason warmup is physically necessary for large-batch training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0768", "title": "The DDP Bucket Straggler", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why isn't the DDP communication overlap hiding the 200ms delay of throttled GPU 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0771", "title": "The Data Parallel Straggler", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "By exactly how much does this single degraded node slow down the entire 256-GPU training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0772", "title": "The NCCL Timeout", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the random NCCL timeout hangs across different ranks, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0773", "title": "The Gradient Overflow", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What's happening on those 3 GPUs, and why does it infect the entire cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0775", "title": "The Reproducibility Paradox", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where are the hidden sources of non-determinism, and what does it cost to eliminate them on a modern GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0776", "title": "The Global Model", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you train a single foundation model across US, EU, and APAC without moving raw regional data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0777", "title": "Multi-modal Candidate Generation at Billion-Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "design", 
"bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the embedding architecture, index, and serving infrastructure to support multi-modal retrieval at this scale for Instagram Reels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0778", "title": "Real-Time Click Prediction with Continual Learning", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the streaming ingestion and distributed training pipeline to update CTR weights within 5 minutes while correcting for delayed clicks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0779", "title": "Scaling Foundation Models on Trillions of Tokens", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect the distributed training strategy (3D parallelism) and specifically optimize the DDP communication overhead to maximize Model Flops Utilization (MFU)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0780", "title": "Global Scale Real-Time Two-Tower Recommendation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the embedding table distribution and serving infrastructure on TPU v5e pods to ensure sub-50ms p99 latency while maximizing TPU High-Bandwidth Memory (HBM) utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0781", "title": "Multi-Turn Gemini LLM Serving with PagedAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design TPU v5p serving for 1M-token chats to reduce KV-cache fragmentation while meeting a 2-second TTFT SLA?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 3}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0782", "title": "Trillion-Parameter MoE Training on TPU Torus Topology", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you map 3D plus expert parallelism for a 2T MoE on 10,000+ TPU v5p chips to avoid OCS bottlenecks and reach 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0783", "title": "The GPU Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the energy cost for this training run, and how does it compare to a cloud compute cost of $3.50/GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0784", "title": "The Container Bloat", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the primary culprits for such a large image, and how would you systematically reduce it to fix cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0785", "title": "The Unresponsive Replica", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What's likely going wrong with our health check strategy and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0788", "title": "The Thermal Memory Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What thermal component are you failing to monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0789", "title": "The Attention Skew", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an evenly chunked sequence cause an asymmetric memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0790", "title": "The Compilation Wall", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is killing your performance and causing the 48-hour start time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0791", "title": "The Root Complex Choke", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the PCIe topology, why didn't adding more NVMe drives fix the IO bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0792", "title": "The Memory Illusion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the throughput reality of this unified-memory design?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0793", "title": "The Ephemeral Storage I/O Cliff", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the 1 TB dataset missing after the spot instance is preempted and training resumes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0794", "title": "The Silent Regression", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you have prevented this silent regression and what deployment strategy would you advocate for future model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0795", "title": "The Silent Failure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the high GPU utilization mask the failure, and how can the hardware be healthy while the ML output is perfectly wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0796", "title": "The Training-Serving Skew", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do the different hardware paths cause this numerical divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0797", "title": "The Model Deprecation Cliff", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does maintaining multiple model generations simultaneously fragment your serving cluster and destroy GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0798", "title": "The LLM Evaluation Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might this evaluation be dangerously misleading, and how could deploying the better model destroy serving economics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0799", "title": "The Retraining Math", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Exactly how often should you trigger a retraining pipeline to minimize total costs?", "chain_ids": 
["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 4}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0800", "title": "The Stale Feature Store", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural flaw causes online fraud model degradation after offline feature pushes, and how would you keep features fresh and consistent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0801", "title": "The Deduplication Economics", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Who is right, and what is the economically optimal deduplication strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0802", "title": "The Silent Schema Shift", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you prevent silent schema shifts in a petabyte-scale ML pipeline from degrading downstream models?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 3}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0803", "title": "The PII-Sensitive Training Dilemma", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which privacy-preserving training approaches would you use for regulated PII data, and how do their utility, complexity, and cost trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0804", "title": "The Exploding Data Lake Bill", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you cut storage costs for a 500 PB S3 data lake while preserving availability for ML workloads and compliance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0806", "title": "The PUE Dollar Cost", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the $5M liquid-cooling retrofit worth it for reducing PUE from 1.4 to 1.1, and what is the payback period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0808", "title": "The Thermal 
Throttling Mystery", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the identical Phoenix cluster training 30% slower than the Oregon cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0809", "title": "The Spot Instance Gamble", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the true expected cost of spot training considering preemption rates, and when does on-demand actually become cheaper?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 0}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0810", "title": "The Energy Bill", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What were the energy consumption and carbon footprint of the 30-day 256× H100 training run in a US data center?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0811", "title": "The Energy Economics", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why is the $100M figure severely underestimating the budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0812", "title": "The Carbon-Aware Scheduler", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a carbon-aware scheduler for the 70B run, and do the numbers prove carbon offsets are cheaper than moving workloads?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 3}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0813", "title": "The Carbon-Neutral Training Scheduler", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 500 weekly training jobs across Virginia, Oregon, and Ireland to minimize carbon within 15% of baseline cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0814", "title": "The Floating Point 32 Checkpoint Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": 
"What is taking up the extra 280 GB, and can you delete it before deploying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0815", "title": "The Model Distillation Economics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should we do the 2-week 32x A100 distillation to replace the $180k/month 70B service with a 7B A10G student?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0816", "title": "The S3 Data Wall", "topic": "data-efficiency-selection", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did streaming from object storage starve your compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0817", "title": "The Parquet Row Group Chunking", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why didn't the columnar format save you bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0818", "title": "The Data Gravity Gravity Well", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you engineer the training pipeline to connect the data to the GPUs, and what is the hidden economic catastrophe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0819", "title": "The Transformer Weight Footprint", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much VRAM do the weights alone consume, and does the model fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 7 GB (INT8 Trap)", "B) 14 GB", "C) 28 GB (FP32 Trap)", "D) 56 GB (AdamW State Trap)"], "correct_index": 1}}, {"id": "cloud-0820", "title": "The Tokenizer Throughput Ceiling", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how many tokens per second must the CPU tokenizer produce to keep an H100 busy on a 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~700 tokens/sec", "B) ~7,000 tokens/sec", "C) ~70,000 tokens/sec", "D) ~700,000 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0821", "title": "The PCIe Transfer Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", 
"status": "published", "phase": "training", "question": "How long does it take to transfer a single batch from CPU to GPU over PCIe Gen5 x16 (64 GB/s unidirectional)?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 0}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~4.8 ms", "B) ~19.2 ms", "C) ~154 ms", "D) ~76 ms"], "correct_index": 1}}, {"id": "cloud-0822", "title": "The Kernel Fusion Memory Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much HBM traffic does fusion eliminate for this tensor?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 0}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~10% of HBM traffic", "B) ~25% of HBM traffic", "C) ~50% of HBM traffic", "D) ~90% of HBM traffic"], "correct_index": 2}}, {"id": "cloud-0823", "title": "The Gradient AllReduce Time", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How long does the AllReduce synchronization take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~11 ms", "B) ~54 ms", "C) ~109 ms", "D) ~1,090 ms"], "correct_index": 2}}, {"id": "cloud-0824", "title": "The ECC Bit Error Reality", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Assuming a raw bit error rate of 1e-15 per bit per hour (before ECC correction), roughly how many uncorrected bit errors would you expect per hour without ECC?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 0}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~0 errors (bits never flip)", "B) ~0.00064 errors/hour", "C) ~6.4 errors/hour", "D) ~640 errors/hour"], "correct_index": 1}}, {"id": "cloud-0825", "title": "The Checkpoint Size Math", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the total checkpoint size including weights (FP16), Adam optimizer states (FP32 momentum + variance), and gradients (FP32)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 26 GB", "B) 78 GB", "C) 182 GB", "D) 364 GB"], "correct_index": 2}}, {"id": "cloud-0826", "title": "The MoE Sparse Activation Ratio", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": 
"What is the effective compute cost per token of this top-2 MoE compared with a dense 47B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100% (94 GFLOPs) - Assuming all parameters are active for every token.", "~36% (34 GFLOPs) - Only backbone + top-2 experts activate.", "~21% (20 GFLOPs) - Forgetting to include the 7B shared backbone.", "~25% (24 GFLOPs) - Assuming top-2 routing without multiplying the 5B expert size by 2."], "correct_index": 1}}, {"id": "cloud-0827", "title": "The Cross-Node AllReduce Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the estimated ring AllReduce time, assuming cross-node InfiniBand is the bottleneck?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 1}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~50 ms", "B) ~200 ms", "C) ~2,000 ms", "D) ~20,000 ms"], "correct_index": 2}}, {"id": "cloud-0828", "title": "The NVLink vs PCIe Tensor Parallel Gap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the per-layer communication time over NVLink 4.0 (450 GB/s) versus PCIe Gen5 x16 (63 GB/s) for batch=32, seq=2048, hidden=4096 in FP16, and is PCIe tensor parallelism viable?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 0}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) NVLink is ~2x faster — minor difference", "B) NVLink is ~5x faster — noticeable but manageable", "C) NVLink is ~7x faster — PCIe makes tensor parallelism impractical", "D) NVLink is ~100x faster — different technology class"], "correct_index": 2}}, {"id": "cloud-0829", "title": "The Flash Attention Memory Cliff", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory does standard attention need for the attention matrix versus FlashAttention-2's working memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0830", "title": "The Cluster MTBF Math", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How often should you expect a GPU failure to interrupt training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Once per month — 10,000-hour MTBF is very reliable", "B) Once per week — failures are uncommon", "C) Once every ~10 hours — cluster reliability compounds", "D) Once per hour — GPUs are inherently unreliable"], 
"correct_index": 2}}, {"id": "cloud-0831", "title": "The Async Checkpoint Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "A synchronous checkpoint would stall training for how long, and how does async checkpointing fix this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~2 seconds — NVMe is fast enough", "B) ~23 seconds — manageable with less frequent checkpoints", "C) ~233 seconds — catastrophic without async checkpointing", "D) ~2,333 seconds — checkpointing is practically impossible"], "correct_index": 2}}, {"id": "cloud-0832", "title": "The Datacenter PUE Cooling Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the total facility power, annual electricity cost, and non-GPU overhead for the 10,000-H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0833", "title": "The TCO Per Token Analysis", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what sustained daily token demand does building on-prem inference capacity become more cost-effective than a $0.50 per million token cloud API?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 3}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0834", "title": "The Spot Instance Training Economics", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Are spot instances worth the operational complexity for this 1,000 GPU-hour fine-tuning job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0835", "title": "The Carbon-Aware Scheduling Tradeoff", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is carbon-aware scheduling to the renewable region worth the 20% capacity and 15% data-latency tradeoff for a 1,000 GPU-hour training job?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 2}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0838", "title": "Recommender Migration Latency Degradation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should they analyze the profile traces to root-cause 
this anomaly across the hardware spectrum?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0839", "title": "Evaluating A/B Test Trade-offs for Heavy Models", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 20% two-week A/B test proceed as requested, or is an alternative architecture needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0841", "title": "High-Resolution ViT OOM Analysis", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck, and what compute-based trade-off would fit the 1024x1024 ViT in 80GB without CPU offloading?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0842", "title": "Evaluating Activation Checkpointing Trade-offs for LLMs", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use full or selective activation checkpointing to reduce the 105GB per-GPU footprint, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0843", "title": "Activation Sparsity Memory Compression", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact compressed activation size in MB after storing a 1-bit bitmask and the non-zero FP32 values?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0844", "title": "Diagnosing Latency Spikes after GELU Replacement", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did replacing ReLU with GELU spike token latency from 75ms to 140ms and increase server power by 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0845", "title": "Calculate Activation Checkpointing Memory Savings", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the peak activation memory with layer-boundary checkpointing, and will the 2B model OOM on an 80GB A100?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 0}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0846", "title": "Diagnosing AOC Link Flaps and Tail Latency", "topic": 
"collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this performance degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0847", "title": "Cascading Failures in Active-Active Inference APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did losing one AZ take down the whole API, and what capacity-planning flaw caused the cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0848", "title": "Sizing Multi-Region Active-Active LLM Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total nodes are needed across two regions for zero-downtime failover, and what is normal peak utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0849", "title": "Global Active-Active Evaluation for Fraud Detection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the cost and latency trade-offs of using global Active-Active versus local Active-Passive to survive one region failure under a 100ms p99 SLA.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0850", "title": "AOC Power and Latency Overhead Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total continuous power overhead of the AOC fabric and the one-way latency penalty for a cross-rack GPU-to-GPU message compared to a local DAC-only route?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0851", "title": "Adversarial Training Compute Overhead", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much additional compute time per epoch does FGSM adversarial training add for the 5-million-image ResNet-50 dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0852", "title": "MoE All-to-All Network Load Imbalance", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause, and how do you resolve the bottleneck?", "chain_ids": 
["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0853", "title": "Routing MoE All-to-All Bursts at Scale", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is ECMP on a non-blocking Fat-Tree sufficient for 32,000-GPU MoE All-to-All bursts, or should you use Adaptive Routing?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0854", "title": "Evaluating Adversarial Debiasing Dynamics in Credit Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did the GRL adversarial debiasing setup hurt throughput and AUC, and how would you redesign it?", "chain_ids": ["cloud-chain-auto-secondary-014-30"], "chain_positions": {"cloud-chain-auto-secondary-014-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0855", "title": "Diagnosing Adversarial Debiasing Instability in NLP", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the GRL debiasing training to plateau main loss at 0.85 while the adversary falls to random accuracy?", "chain_ids": ["cloud-chain-auto-secondary-014-30"], "chain_positions": {"cloud-chain-auto-secondary-014-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0856", "title": "Diagnosing Moderation Evasion Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this failure and what is the root cause?", "chain_ids": ["cloud-chain-auto-secondary-015-24"], "chain_positions": {"cloud-chain-auto-secondary-015-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0857", "title": "Sizing RoCEv2 Buffers for Adaptive Routing", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum path latency skew the 16MB reassembly buffer can tolerate at 800 Gbps line rate?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], 
"chain_positions": {"cloud-chain-auto-secondary-013-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0858", "title": "E-commerce Recommendation Alignment Gap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you modify the ranking objective and architecture to stop optimizing cheap clicks and recover revenue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0860", "title": "Evaluating Defenses for High-Throughput Content Moderation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you deploy under a 50ms SLA and a tight training budget?", "chain_ids": ["cloud-chain-auto-secondary-015-24"], "chain_positions": {"cloud-chain-auto-secondary-015-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0861", "title": "Diagnosing Clickbait Degradation in DLRM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the retention and watch-time drop, and how should the ranking model be realigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0862", "title": "Diagnosing FSDP AllGather Topology Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and root-cause this collective communication stall?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 2}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0863", "title": "Evaluating Proxy Misalignment in E-Commerce Ranking", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you realign the recommender with GMV while keeping P99 serving latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0864", "title": "Ring-AllReduce Transfer Time Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum Ring-AllReduce time to synchronize the 10B FP16 gradients across the 8 GPUs?", 
"chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 1}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0865", "title": "Ring AllGather Time in FSDP", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum time for the ring AllGather to reconstruct the 2.4GB block across 8 GPUs?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 1}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0866", "title": "Flat vs Hierarchical AllGather", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use flat or hierarchical AllGather for FSDP weight reconstruction across 512 GPUs, and what latency difference drives that choice?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 3}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0867", "title": "MoE AllToAll Communication Time", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum theoretical time for the NVSwitch AllToAll when each A100 sends 1.5GB to each of 7 peers?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 0}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0868", "title": "Evaluating Flat vs. 
Hierarchical AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which AllReduce topology minimizes latency for a 2GB gradient payload on 1024 A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0869", "title": "Diagnosing MoE AllToAll Network Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck during the AllToAll phase, and how do you mitigate it?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 1}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0870", "title": "Diagnosing Inter-Node AllReduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the actual bottleneck causing the 320ms AllReduce latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0871", "title": "Hardware Upgrade Speedup Estimation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the new training step time and overall speedup after moving to H100s with 4x GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0872", "title": "Evaluating H100 Upgrades for Hybrid Recommendation Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the 8x H100 upgrade justified given the 3x GPU speedup, unchanged CPU/network time, and 4x hourly cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0873", "title": "API Cost and Rate Limits for Model Theft", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the attacker's total cost and sustained RPS over 7 days, and why do burst rate limits fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0874", "title": "Diagnosing H100 Pipeline Bottlenecks", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the hardware migration deliver only 1.4x end-to-end speedup instead of 
the 3.2x FP16 TFLOPS gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0875", "title": "Diagnosing Systematic Probing for LLM Extraction", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 100-query/min user conducting a viable model extraction attack, and how much information leaks via full logprobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0876", "title": "Diagnosing GPU Starvation in Vision Pipelines", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is bottlenecking the ViT-H training loop, and how would you size the prefetching pipeline to eliminate GPU bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0877", "title": "Sizing Prefetch Buffers for GPU Starvation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many CPU dataloader workers and prefetched batches are needed to match the 80ms GPU step and hide 500ms disk spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0878", "title": "Evaluating API Defenses Against Model Extraction", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you choose against the 2M-query extraction attack, and how do efficacy, revenue, and utility trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0879", "title": "Evaluating Async Prefetching for H100 Starvation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs between CPU workers, offline preprocessing, and async GPU decoding (DALI) to stop H100 starvation.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0880", "title": "Dynamic Autograd Tape vs Static Compilation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you wrap the dynamic GNN training step in torch.compile/XLA, or keep eager autograd, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0881", "title": "Autograd Tape Memory Footprint in Dynamic RNNs", "topic": "vram-budgeting", "competency_area": "memory", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much autograd-tape memory do the 15,000 saved FP32 RNN tensors consume, and does it exceed the 10GB activation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0882", "title": "Diagnosing Autograd Tape Memory Leaks in Gradient Accumulation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does gradient accumulation OOM at micro-batch 14, and how should the training loop use autograd to avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0883", "title": "Autograd Activation Memory vs Recomputation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does storing H consume per layer, and how much would recomputing H in a custom backward save?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 0}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0884", "title": "Forward vs. Reverse Mode AD", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch the Jacobian computation to forward-mode AD, and what latency should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0885", "title": "KV Cache Memory Bandwidth Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing memory bandwidth saturation during 70B autoregressive decoding, and which component should you redesign?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0886", "title": "PagedAttention vs GQA for KV Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you prioritize PagedAttention or an 8x-KV-head GQA model to maximize 4096-token decoding throughput, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0887", "title": "KV Cache Capacity and Bandwidth Limits", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the KV cache size per token, maximum 2048-token batch size, and total memory read per decoding step?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0888", "title": "Estimating Backdoor Poisoning Rate", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 5x5-triggered poisoned images must the attackers inject to appear in at least 50% of 8192-image batches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0889", "title": "Debugging Autograd Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the backward pass OOM with T=1000 activations, and how can you train without reducing sequence length or batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0890", "title": "Evaluating Backdoor Mitigations in Cloud Code Gen", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use activation-clustering sanitization or inference-time prompt perturbation to neutralize the docstring backdoor, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0891", "title": "Diagnosing Targeted Triggers in Cloud Vision APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose whether this is a backdoor data poisoning attack versus a natural adversarial example or a dataset bias, and what specific metrics do you use to isolate the trigger?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0892", "title": "Multi-AZ Link Buffer Exhaustion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What in-flight data is needed to saturate one 400Gbps cross-AZ link over 50km, and can 64MB-buffer ToR switches support it?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 0}, "chain_tiers": {"cloud-chain-auto-027-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0893", "title": "Geo-Distributed Training Throughput Collapse", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 400Gbps inter-AZ link capped near 32Gbps, and what buffer or communication redesign is required?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 1}, "chain_tiers": {"cloud-chain-auto-027-05": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0894", "title": "Evaluating Compute Upgrades for LLM Decoding", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will doubling TFLOPs while keeping 2.0TB/s bandwidth halve batch-1 token latency, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0895", "title": "LLM Decoding Throughput on A100", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What speedup should you expect from doubling TFLOPS but keeping 2.0 TB/s bandwidth for batch-1 7B decoding, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0896", "title": "Diagnosing Low SM Utilization in LLM Decoding", "topic": "extreme-quantization", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is upgrading the batch-1 7B LLM service to an H100 a good way to cut latency to 2ms, and what should you do instead?", "chain_ids": ["cloud-chain-auto-secondary-011-24"], "chain_positions": {"cloud-chain-auto-secondary-011-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0897", "title": "Bandwidth Taper in DLRM Embeddings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum time to fetch 1.6GB of embeddings over PCIe Gen4 x16, and how does it compare with HBM2e-resident data?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 1}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0898", "title": "Diagnosing Multi-GPU Pipeline Stalls", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bandwidth bottleneck is starving the GPUs despite 600 GB/s internal bandwidth, and how do you quantify it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0899", "title": "Evaluating Parallelism Mapping Across the Bandwidth Taper", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should tensor parallelism span multiple H100 nodes here, or how should TP, PP, and DP be mapped across the cluster?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0900", "title": "Evaluating WAN Links for Multi-Datacenter LLM Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the two datacenters be pooled for synchronous 175B training, and how should TP, DP, and PP be mapped?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 2}, "chain_tiers": {"cloud-chain-auto-027-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0901", "title": "Parquet Batch Ingestion Memory Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum batch time window avoids OOM in the 8 GiB container, and what size Parquet file does that batch produce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0902", "title": "Bare-Metal MCU Memory Allocation Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the peak SRAM utilization during Cortex-M4 inference, and does the 150KB-weight INT8 CNN fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0903", "title": "Evaluating Bare-Metal SmartNIC Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 5-microsecond p99 packet SLA, should inference run on the Linux Xeon host or bare-metal SmartNIC Cortex-M7, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0904", "title": "Debugging HardFaults in Bare-Metal Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the root cause of this crash without OS-level memory profiling tools?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0905", "title": "Diagnosing Small Batch Instability in 3D CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and resolve this architectural bottleneck causing erratic inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0906", "title": "Batch vs Streaming Ingestion for CTR Models", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", 
"zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the Ads CTR pipeline use continuous Kafka streaming or a scheduled 15-minute 1GB Parquet batch ingestion system, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0907", "title": "SyncBatchNorm Communication Overhead", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much data does one GPU transmit during the SyncBN forward-pass Ring AllReduce for 1024 FP32 channels on 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0908", "title": "Normalization for Micro-Batch 3D Segmentation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes the 3D U-Net instability at batch size 1 per GPU, and how should the normalization layers be redesigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0909", "title": "LLM Batch Size Memory Limit", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What peak VRAM is required for batch size 32, and what maximum feasible batch size should be configured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0910", "title": "Diagnosing S3 Batch Ingestion Bottleneck", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this ingestion bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0911", "title": "Analyzing Activation Memory OOM at Scale", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 OOM at batch size 512 despite under 1GB of weights and optimizer state, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0912", "title": "Transformer Batch Size and Memory Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What limits batch-size scaling on the 24GB GPU, what is the maximum viable batch size, and what safe cap would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0914", "title": "Quantifying LLM Benchmark Contamination", "topic": 
"data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the model's true accuracy on the 400 clean benchmark questions, and how much absolute accuracy inflation came from contamination?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 0}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0915", "title": "Diagnosing Sudden MMLU Score Spikes in LLM Pre-training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this anomaly and what mitigation should you apply?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 1}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0916", "title": "Calculating Maximum Batching Window", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum Triton max_queue_delay can safely meet the 100ms SLA at max batch size 16?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 1}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0917", "title": "Low Traffic Latency Spikes", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does p99 latency worsen at 5 QPS, and what maximum batching_window guarantees the 100ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0918", "title": "Evaluating Contamination in Code LLMs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can the model score 82% on HumanEval yet drop production acceptance to 22%, and how should contamination be prevented?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 2}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0919", "title": "Multi-Dimensional Resource Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 8-GPU training jobs pending for Insufficient CPU at only 60% utilization, and how is the scheduler stranding GPUs?", "chain_ids": 
["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 1}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0920", "title": "Evaluating Multi-Dimensional Bin Packing for Stranded Capacity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What multi-dimensional scheduling heuristic would reduce stranded capacity, and how does it improve packing over GPU-only Best-Fit?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 2}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0921", "title": "Calculate Memory and Compute Savings for a BNN", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the BNN weight footprint with 5% of FP32 parameters kept at INT8, and how many 64-bit XNOR-popcount instructions are needed?", "chain_ids": ["cloud-chain-auto-secondary-011-23"], "chain_positions": {"cloud-chain-auto-secondary-011-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0922", "title": "Evaluating BNNs for Cloud-Scale Filtering", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 1M QPS traffic filter move from INT8 GPUs to a BNN on cloud FPGAs, and what are the compute and memory trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-011-23"], "chain_positions": {"cloud-chain-auto-secondary-011-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0923", "title": "Calculate Cluster Bisection Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the bisection bandwidth of the 1,536-GPU leaf-spine fabric, and what per-GPU bandwidth limits global Ring AllReduce?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 0}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0924", "title": "Diagnosing 4:1 Network Oversubscription in LLM Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical network bottleneck explains the 4x slower global AllReduce despite perfect 600GB/s intra-node NVLink 
utilization?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 1}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0925", "title": "BNN Throughput Regression on Cloud GPUs", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the BNN on NVIDIA T4 1.5x slower than the INT8 version despite the expected 32x memory saving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0926", "title": "Evaluating Topologies for MoE All-to-All", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 4096 GPUs running MoE All-to-All, should you choose a non-blocking Fat-Tree or a 3D Torus, and what bisection bandwidth drives the choice?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 2}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0928", "title": "Multi-Dimensional Bin Packing and Stranded GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using GPU-sorted First-Fit Decreasing, how many nodes are required for the 5 fine-tuning and 10 serving jobs, and how many GPUs are stranded?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 0}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0929", "title": "Decoupling Shared Embeddings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the shared 30GB embedding model be decoupled or versioned to stop Team A's update degrading Team B under the 15ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0930", "title": "Revenue Impact of Implicit Data Dependency Erosion", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much revenue is lost over the 72-hour weekend from the 1.5% click-yield drop, and what boundary control would prevent this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0931", "title": "Byzantine Failures in Distributed GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "training", "question": "Over 4 weeks on 8,192 GPUs, how many Byzantine failures are expected, and what is the chance of at least one?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 0}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0932", "title": "Evaluating BSP Mitigation Strategies for Stragglers", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use pipeline async, Asynchronous Parallel (ASP), or mitigated BSP for this 2048-GPU training cluster, and why?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 3}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0933", "title": "Diagnosing Stragglers in BSP Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does a single GPU that is 400ms late to the BSP AllReduce affect 1,024-GPU throughput and utilization?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 2}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0934", "title": "Diagnosing Silent Data Corruption in Distributed LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and isolate the root cause of this failure without halting the entire cluster for days?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 1}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0935", "title": "Evaluating SDC Mitigations in ZeRO-3 Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which SDC mitigation best fits the 250ms ZeRO-3 step budget, and why?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 2}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0936", "title": "Bulk Synchronous Parallel Straggler Impact", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the new BSP step time when one GPU takes 180ms to compute, and what 
is the compute utilization of the 63 healthy GPUs?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 1}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0937", "title": "A100 KV Cache L2 Thrashing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory hierarchy bottleneck causes the batch-16 throughput collapse, and how would you fix the kernel?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 2}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0938", "title": "GPU L1 Cache Tiling for Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the largest power-of-two square tile size B that fits Q, K, and V in shared memory after reserving 32KB?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 1}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0939", "title": "Evaluating IO-Aware Attention Tiling on A100 Caches", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which tiling strategy provides better utilization of the Tensor Cores, and why?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 3}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0940", "title": "CUDA Allocator Fragmentation Estimation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 4GB allocation fail despite 16GB of inactive reserved memory, and how should you fix the allocator behavior?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0941", "title": "Diagnosing PyTorch OOM with Dynamic Sequence Lengths", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can the 2GB allocation fail with 25GB reserved but unallocated, and what production fix handles dynamic sequence lengths?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0942", "title": "Capacity Planning for Canary Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": 
"L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many dedicated g5.xlarge instances are required for the 5% v2 canary to guarantee the 200ms SLA?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 1}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0943", "title": "Mitigating Memory Fragmentation in Dynamic LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate disabling the allocator versus padding to 2048 tokens, and what memory strategy should replace them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0944", "title": "Canary Traffic Batching Timeouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the P99 latency spike, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 2}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0945", "title": "Canary Traffic Sizing and Resource Allocation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you choose between a 1% and 10% canary split given the 100-GPU limit and CTR significance needs?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 3}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0946", "title": "Queueing Delay in Provisioning", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did provisioning 75 instances fail at 50 RPS, and how many instances are needed to meet the 5s P99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0947", "title": "SLA-Constrained Inference Capacity", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the 19-GPU Batch-32 plan violate the 100ms P99 SLO, and how many GPUs are actually required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0948", "title": "GPU Capacity Planning with Batching Constraints", "topic": "model-serving-infrastructure", "competency_area": 
"deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming negligible batch accumulation time, how many GPUs are required to handle 4,000 RPS under the 150ms P99 SLO while minimizing cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0949", "title": "WAN Overhead in Carbon Scheduling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did moving the 10-hour 1,024-GPU job to EU-North increase carbon despite the cleaner grid?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0950", "title": "Centralized Checkpointing Network Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the absolute minimum time required just to transfer the checkpoint data from the workers to the coordinator?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 0}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0951", "title": "Carbon-Aware Workload Shifting", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many metric tons of CO2 are saved by delaying the 250-node, 50-hour preprocessing job by 12 hours?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 0}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0952", "title": "Evaluating Centralized Checkpointing Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can't the centralized head-node checkpoint meet the 2-minute SLA, and what architecture should replace it?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 2}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0953", "title": "Centralized Checkpoint Incast Failure", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What bottlenecks cause the 4.5-minute centralized checkpoint stall and rank-0 packet drops?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 1}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0954", "title": "Evaluating Elastic Training for Carbon Minimization", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Region 1 continuous training and Region 2 solar-only intermittent training compare on CO2 and time-to-market?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0955", "title": "Scaling Randomized Smoothing Certification", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you serve N=10,000 randomized-smoothing inferences under a 250ms P99 synchronous SLA?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0956", "title": "Certified Radius Calculation for Biometric API", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum top-class probability p_A is required with σ=0.5 to certify radius R=0.5?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0957", "title": "ZeRO-3 Checkpoint Stalls on Lustre", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the 7TB checkpoint take 20 minutes instead of 35 seconds, and how would local NVMe staging unblock training?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 0}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0958", "title": "Diagnosing Distributed Checkpoint IO Storm Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes the checkpoint-induced collective timeouts, and what checkpointing architecture should replace the central NFS path?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 1}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0959", "title": "Mitigating Checkpoint Storms in LLM Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": 
"published", "phase": "training", "question": "Should you buy 500GB/s Lustre or use two-tier local-NVMe checkpointing for the 2.45TB checkpoints, and why?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 2}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0960", "title": "Optimal Checkpoint Frequency for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the optimal checkpointing interval for a 1.6TB checkpoint, 80GB/s writes, and 24-hour MTBF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0961", "title": "Diagnosing Checkpoint Stalls in Large Scale LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What bottleneck causes the 40-minute FSDP checkpoint stall when gathering to rank 0, and how should it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0963", "title": "Optimizing LLM Checkpoint Intervals at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With a 12h MTBF and 120s Lustre writes, what checkpoint interval is optimal, and is async node-local NVMe worth implementing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0964", "title": "Debugging Randomized Smoothing Radius Collapse", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does truncating randomized-smoothing samples from 100,000 to 200 collapse the certified radius to 0, and how should you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0965", "title": "Diagnosing Thread Exhaustion Cascades", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the total ThreadExhaustion outage when the feature store p99 hit 5s, and what mechanism contains it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0966", "title": "Evaluating Circuit Breaker Thresholds for Embedding APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "training", "question": "Why is a circuit breaker preferable to retries for the 10,000QPS embedding API, and what timeout, threshold, and fallback should it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0967", "title": "Diagnosing Clean-Label Backdoors in KYC Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How was the glasses backdoor embedded without label flipping, and how would you detect the poisoned samples at 10M-image scale?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 1}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0968", "title": "Clean-Label Poisoning Ratio Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the minimum number of clean-label poisoned samples needed to reach a 5% poisoning ratio in the 10,000-sample target class?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 0}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0969", "title": "Evaluating Defenses for Clean-Label Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Given a strict 72-hour cluster allocation limit, should you defend with DP-SGD or activation-space clustering, and why?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 2}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0970", "title": "CPO Power Savings Calculation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-switch peak power savings from CPO and annual energy savings for the 500-switch cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0971", "title": "Evaluating CPO vs Pluggable Optics for 51.2T Switches", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 100,000-GPU fabric use 800G pluggables or CPO under the 40kW rack limit, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0973", "title": "Cold Restart Recovery Time Calculation", "topic": 
"fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the total time to recovery after the switch failure before training can resume?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 1}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0974", "title": "Evaluating Cold Restart vs Warm Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you approve the 2-minute warm restart system or keep the 15-minute cold restart architecture, and why?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 3}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0975", "title": "Diagnosing Cold Restart Read Storms", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck causing the 45-minute cold-restart delay and S3 503s, and what checkpoint-loading architecture fixes it?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 2}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0976", "title": "Diagnosing DDP All-Reduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the root cause of this performance drop, and why are both proposed solutions counterproductive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0977", "title": "Evaluating MoE All-to-All Topology", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you spend $2M on 800 Gbps NDR or keep 400 Gbps and restrict EP=8 to intra-node NVLink groups, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0978", "title": "Diagnosing Stalled Computation During NCCL AllGather", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the hardware failing to overlap these operations?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 1}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0979", "title": "Evaluating Gradient Bucketing for Overlap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which gradient bucket size and CUDA stream configuration should you use for FSDP on 64 GPUs to maximize communication-computation overlap?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 2}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0980", "title": "Calculating Data Parallel Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Ring All-Reduce communication time per step and comm-to-compute ratio, and is the 16-node job communication-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0981", "title": "Data Parallelism on 10 Gbps Ethernet", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-step Ring All-Reduce communication and compute times, and is the 16-instance fine-tuning job communication-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0982", "title": "Evaluating Network Upgrades vs ZeRO-3 for LLM Scaling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 512-GPU run use 800 Gbps InfiniBand or ZeRO-3 on 400 Gbps, and how does each affect the comm/compute ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0983", "title": "Calculating Gradient Bucketing Overlap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If you partition the gradients into 4 equal-sized buckets to overlap the communication with the backward pass computation using CUDA streams, what is the new total time for the backward pass and synchronization, assuming perfect overlap and no scheduling overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0984", "title": "Diagnosing High Communication-Computation Ratio in 3D Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 25% SM utilization, and how would you change TP/PP placement to avoid inter-node tensor-parallel AllReduce?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0985", "title": "Evaluating Spatial Placement for Fused Attention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should Softmax run sequentially on the MXU or be spatially pipelined on the VPU, and what is the resulting block latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0986", "title": "Systolic Placement of Depthwise Convolutions", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does mapping the 3x3 depthwise convolution onto the 128x128 systolic array yield under 1% utilization, and where should it run instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0987", "title": "Computational Graph Node Fusion", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the total execution times before and after fusing the 5 element-wise graph nodes, and what speedup does fusion provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0988", "title": "Evaluating Operator Fusion in Computational Graphs", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you compile the graph to fuse LayerNorm, GeLU, and residual adds, and how much HBM traffic does fusion save for a 16MB activation?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 3}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0989", "title": "Wall-Clock Training Time Estimation for 175B Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many days will it take to train the 175B model on 1T tokens using 1,024 A100s at 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0990", "title": "Weight vs Output Stationary Dataflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many weight-matrix element reads does WS versus OS require for this 256x256 tiled XW layer, and which placement is optimal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-0991", "title": "Diagnosing Low MFU in LLM Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the primary bottleneck behind the 35% MFU, and how would you change the pipeline schedule or micro-batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0992", "title": "Diagnosing OOM from Retained Graphs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing GPU memory to grow by 2.5GB per iteration, and how should the loss logging code be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0994", "title": "Profiling Network Upgrades in ViT Training", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did doubling the ViT-Huge cluster network bandwidth from 200Gbps to 400Gbps reduce step time by only about 2%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0995", "title": "Evaluating MFU vs Network Upgrades", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you spend 6 weeks raising MFU to 55% or upgrade the interconnect, and what training-time savings justify the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0996", "title": "LLM Dense Layer Profiling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the [2048,4096] x [4096,4096] FP16 projection compute-bound or memory-bound on an A100, and what is its minimum latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0997", "title": "Diagnosing MLP Optimization Bottlenecks", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Will 2x weight compression speed up the ViT MLP layer, or is it compute-bound under the Roofline model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0998", "title": "Evaluating Upgrades for High-Intensity GEMMs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you upgrade to A100 80GB memory bandwidth or use 2:4 sparsity for the GEMM SLA miss, and what speedup should you expect?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0999", "title": "CXL vs InfiniBand for DLRM Embeddings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the average embedding lookup latencies for two-node InfiniBand sharding versus single-node CXL expansion, and which is lower?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 0}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1000", "title": "Evaluating CXL Memory Pooling for DLRM", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 10TB DLRM embeddings, should you choose RDMA remote memory or rack-scale CXL 3.0 pooling for random lookups, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1001", "title": "DLRM Bottleneck with CXL Pooling", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did CXL 2.0 memory pooling hurt embedding lookup tail latency despite low bandwidth use, and how should embeddings be tiered?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 1}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1002", "title": "Diagnosing Silent Failures in CTR Prediction", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this silent model failure and architect a mitigation strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1003", "title": "Financial Impact of Drift Mitigation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the net daily financial impact of moving to daily retraining if it recovers the CTR to 4.5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1004", "title": "Mitigating Concept Drift in High-Throughput Fraud Detection", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the anti-fraud system use 5-minute online learning or 6-hour micro-batch retraining for stable P(X) but falling precision, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1005", "title": "Evaluating OOD Rejection Under Strict Latency", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which calibration strategy should the MRI API use to abstain on OOD scanner artifacts under the 200ms P99 latency budget?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 2}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1006", "title": "Debugging RoCEv2 Congestion Spreading in a Clos Network", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can Job A's RoCEv2 incast and PFC pauses stall Job B in other racks, and what mitigations would you apply?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 1}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1007", "title": "Incast-Driven PFC Congestion Spreading Time", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 4:1 incast take to trigger PFC at the 4MB threshold, and how does that affect unrelated Spine traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1008", "title": "Diagnosing OOD Overconfidence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this overconfidence and what calibration technique resolves it without requiring a full retraining cycle or violating the latency constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1010", "title": "RoCEv2 Fabric-Wide PFC Congestion Spreading", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What mechanisms drove the PFC pause storm, and how would DCQCN with lower ECN thresholds contain it?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 2}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1011", "title": "Spot Preemption Batch Adjustment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you adjust your data loading and training loop parameters to resume training 
without altering the mathematical optimization dynamics?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 0}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1012", "title": "Diagnosing Loss Spikes During Elastic Training Scaling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did elastic scaling from 128 to 256 GPUs cause divergence, and how should you adjust per-GPU batch size to keep GBS constant?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 1}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1013", "title": "Elastic Scale-Down with Constant Global Batch Size", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "When the fleet scales from 64 to 16 nodes with GBS locked at 2048, how should you adjust micro-batch and accumulation under 40GB VRAM?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 2}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1014", "title": "Diagnosing Static Preprocessing Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are these operations burning GPU cycles at runtime, and how do you eliminate this overhead without altering the numerical output?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 2}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1015", "title": "Quantifying Late-Stage Constraint Costs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the cost of discovering the T4 16GB constraint after training versus profiling it upfront, and what process change prevents it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1016", "title": "Calculating Wasted FLOPs from Unfolded Constants", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many FLOPs are wasted per inference by computing the fixed causal mask and frozen W_1 @ W_2 at runtime instead of constant folding them?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 1}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1017", "title": "Constant Folding Dense Normalization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much memory traffic does constant folding save for the 1M-feature normalization subgraph on an NVIDIA L4?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 3}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1018", "title": "LLM Deployment VRAM Constraint Propagation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What caused the OOM on the T4s despite INT8 weights, and how should the serving constraints have propagated back into architecture design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1019", "title": "Mitigating Late-Stage Hardware Constraints", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is post-training 4-bit quantization and pruning a viable salvage strategy for the 70B FP16 fraud model on a single 24GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1020", "title": "In-Place Rolling Deployment VRAM Saturation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the in-place zero-downtime rollout of two 15GB DLRM models on 16GB GPUs spike P99 latency, and what architecture fixes it?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 2}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1021", "title": "Canary Rollout Time Estimation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much total time is required to reach 100% traffic at 400 RPS under the 5,000-request and 10-minute bake rules?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1022", "title": "Continuous Deployment VRAM Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which deployment strategy should you use for 15GB CTR model updates on 24GB L4s to avoid OOMs and P99 latency 
spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1023", "title": "Continuous Training Frequency Optimization", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "To maximize net profit, how many days should you wait between retraining runs?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 1}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1024", "title": "Seasonality-Induced Drift Triggers", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely root cause of the hyper-active retraining, and how do you prove it?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 2}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1025", "title": "CNN Layer Compute and Memory Sizing", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For one 1024x1024x3 tile, what are the MACs and FP16 output feature-map footprint of the 7x7 stride-2, 64-channel convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1026", "title": "Continuous Fine-Tuning vs From-Scratch Retraining", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural and modeling risks are introduced by daily CFT, and how do you quantitatively justify the decision?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 3}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1027", "title": "Evaluating 3D vs Factorized Convolutions for Video", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you retain the standard 3D ResNet on 16GB T4s or replace it with factorized (2D spatial + 1D temporal) convolutions, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1028", "title": "Diagnosing Depthwise Conv Underutilization", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do MobileNet depthwise convolutions show under 20% SM 
utilization on A100 even at batch size 256?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1029", "title": "Diagnosing Pipeline Calibration Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did Platt scaling make CTR calibration look normal while overall ad revenue fell 15% after retrieval recall increased?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 1}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1030", "title": "Hidden Correction Cascades in Bidding Pipelines", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the over-bid per impression and daily loss from the stale 1.25x CTR correction, and how should you prevent this cascade?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 0}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1031", "title": "Diagnosing Correlated GPU Node Failures in a Cluster", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What root cause explains exactly 16 nodes dropping simultaneously in the 512-GPU training cluster?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 2}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1032", "title": "Mitigating Correlated Rack Failures in 2048-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs between node-local NVMe checkpointing and cross-rack asynchronous checkpointing, considering the impact of a correlated failure on training progress and cluster utilization.", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 3}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1033", "title": "Financial Impact of Counterfeit Transceivers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected additional monthly downtime cost caused by a 15% counterfeit rate among the 256 optical transceivers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1034", "title": "Evaluating Pipeline Correction Cascades", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you accept the fallback-ad and CTR temperature-scaling hotfixes, or fix the CG model directly, and why?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 2}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1035", "title": "Evaluating Grey-Market GPU Fleet Reliability", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you integrate the unauthorized baseboards or delay 6 months for verified hardware, given 15% ECC errors and throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1037", "title": "Diagnosing Anomalous SDC and ECC Errors in GPU Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What likely explains the refurbished A100 ECC, SDC, thermal, and firmware anomalies, and what definitive diagnostic would you run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1038", "title": "Diagnosing Covariate Shift in Fraud Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is the fraud model suffering concept drift or covariate shift, and how would you diagnose and adapt without waiting 4 weeks for labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1039", "title": "Rack-Level Correlated Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the 14-day job failure probability for packing 64 nodes into 2 racks versus spreading them across 8 racks?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 1}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1040", "title": "Evaluating Covariate Shift Mitigation in CTR Prediction", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a PSI of 0.35 with unchanged P(Y|X), should you fully retrain the CTR model or use importance weighting, and why?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1041", "title": "Calculating Latency Impact of CPU Affinity", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you use CPU affinity and NUMA binding to bring 4-thread BERT requests under the 15ms SLA?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 0}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1042", "title": "Evaluating CPU Pinning for P99 Latency SLA", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt strict CPU affinity on the dual-socket EPYC inference fleet despite possible throughput loss, and why?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 2}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1043", "title": "Diagnosing AVX-512 Remainder Loop Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the vectorization efficiency so low, and how do you root-cause and fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1044", "title": "Evaluating AVX-512 VNNI for Custom Kernels", "topic": "extreme-quantization", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you hand-write AVX-512 VNNI intrinsics for the custom INT8 attention kernel or refactor to oneDNN-backed operators, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1045", "title": "Calculate AVX-512 INT8 Peak Throughput", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical peak INT8 MAC throughput per clock cycle for one Ice Lake core using AVX-512 VNNI and 2 FMA units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1046", "title": "Diagnosing P99 Latency Jitter on CPU Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and resolve this performance jitter?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 1}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1047", "title": "Calculating Credit Assignment Compute Requirements", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long would one optimization step take with finite differences versus standard reverse-mode autodiff for the 1.5B-parameter Transformer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1048", "title": "Diagnosing Early-Layer Credit Assignment Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why have gradients collapsed in the early layers of the Post-LayerNorm Transformer, and what architecture change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1049", "title": "Diagnosing InfiniBand Scaling Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are millions of 10KB AllReduce operations scaling poorly on 400Gbps InfiniBand, and is upgrading to 800Gbps the right fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1050", "title": "Gradient Synchronization Critical Message Size", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What critical message size makes payload transfer time equal the 2µs startup latency on a 400Gbps RoCEv2 network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1051", "title": "CUDA Graphs for Low-Latency Inference", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the end-to-end latency before and after CUDA Graphs for 250 kernels with 4µs GPU time and 5µs launch overhead?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 0}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1052", "title": "Evaluating Fabrics via Critical Message Size", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which fabric minimizes communication for 1MB updates, what are their critical message sizes, and when should you switch to Fabric A?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1053", "title": "CUDA 
Graphs vs Kernel Fusion for LLM Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate these two optimization options to hit the 50ms SLA?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 2}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1054", "title": "Analyzing High CPU Utilization in LLM Inference", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the H100 idle 40% during Llama-3 8B decode with 3µs kernels, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 1}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1055", "title": "Evaluating Unbounded Credit Assignment Strategies", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use TBPTT with a 2,048-step window or gradient checkpointing over 32,768 steps, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1056", "title": "Diagnosing H2D Transfer Overlap Failures", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are H2D copies and compute still sequential despite pin_memory=True and non_blocking=True, and how do you overlap them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1057", "title": "Evaluating Multi-Stream Overlap for Inference Pipelines", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What throughput speedup can a 3-stream H2D/compute/D2H pipeline deliver, and what constraints could prevent reaching it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1058", "title": "Diagnosing Custom ASIC Underutilization with Dynamic Shapes", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the custom ASIC show 3x higher P99 latency at 25% utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1059", "title": "NCCL AllReduce vs Backward Compute Overlap on a 4-GPU T4 Box", "topic": "data-parallelism", "competency_area": "parallelism", 
"track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "On 4x T4s with no NVLink and a 2 GB FP32 gradient over PCIe, what does bucketed AllReduce overlap save versus a post-backward single AllReduce, and where do bucket-size returns diminish?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1060", "title": "Custom ASIC TCO Break-Even Evaluation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what GPU-equivalent deployment scale does the custom ASIC break even, and what architectural risk must justify it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1061", "title": "Sizing an NER-based PII Redaction Fleet", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPUs are needed to handle 10,000 RPS of 256-token DistilBERT PII anonymization at 50% utilization given a 125 TFLOPS capacity per GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1062", "title": "CPU Bottleneck in PII Anonymization", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 2.5s P99 latency spike despite 45% GPU utilization, and how should the DAL be deployed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1063", "title": "ASIC TCO Break-Even Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many ASICs are required to break even on the $50M NRE after normalizing for QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1065", "title": "Evaluating Real-World Generalization vs Public Benchmarks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which model should you deploy for the 100,000/day medical imaging API, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1066", "title": "Diagnosing Production Accuracy Collapse in Cloud CV", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is production accuracy 72% despite 94% offline accuracy, and how would you recover it?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 1}, "chain_tiers": 
{"cloud-chain-auto-011-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1067", "title": "Evaluating Local vs Managed Data Anonymization Layers at Scale", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use the managed DLP API or local BERT-Large DAL for 100 RPS, and what are the latency, utilization, and cost impacts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1068", "title": "Diagnosing Silent Feature Corruption", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the FPR spike after retraining, and why did standard validation metrics fail to catch it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1069", "title": "Upstream Cleaning vs Downstream Robustness", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you absorb the 2% label noise downstream or build the 100,000 CPU-hour cleaning pipeline, and what is the cost trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1070", "title": "Compounding Costs of Data Cascades", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the production-stage data cascade cost versus the wasted 64-A100 training compute cost?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 0}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1072", "title": "Calculating Transfer Time for a 20PB Dataset", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical minimum transfer time, and what architecture circumvents this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1073", "title": "Evaluating Data-Centric Upgrades vs Model Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to ViT-Large or mine hard negatives to retrain ResNet-50, given the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1074", "title": "Data Lineage Overhead at Scale", "topic": 
"data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What daily lineage storage overhead and ingest bandwidth does 50,000 TPS create, and how should you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1075", "title": "Foundation Model Data Gravity Trade-offs", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you stream the 20 PB dataset over dual 100 Gbps links or migrate it to S3 before training on 4,096 H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1076", "title": "Cost of Noisy Data Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total data-plus-compute costs for 50M noisy images versus 5M cleaned images, and which path is cheaper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1077", "title": "Diagnosing Pipeline Drops via Data Lineage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you utilize data lineage tracking to analyze the provenance of this feature and root-cause the regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1078", "title": "Evaluating Inline vs Out-of-Band Data Lineage at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use inline provenance metadata or an asynchronous lineage registry at 50,000 TPS, and what are the bandwidth and latency trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1079", "title": "Storage Cost of Naive Data Lineage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the 30-day S3 Standard storage cost ($0.023/GB-month) of full-copy lineage versus pointer-based lineage for the 5TB daily log pipelines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1080", "title": "Row-Level vs Partition-Level Lineage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 10TB LLM corpus use row-level UUID lineage or partition-level hashes and artifact URIs, and why?", "chain_ids": 
["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 2}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1081", "title": "Diagnosing Mutable Lineage Failures", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the reconstructed feature distributions mismatch the MLflow stats, and what lineage change ensures reproducibility?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 1}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1082", "title": "Local vs Cloud Inference Offloading", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should inference run locally or in the cloud for each 50GB batch, and what are the total processing times?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 0}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1083", "title": "Cross-Environment Batch Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the GPUs at only 2.5% utilization on the 5 PB genomic job, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1084", "title": "Cross-Region Training vs Data Locality", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 100 TB daily trading logs be trained locally in Frankfurt or transferred to New York, and what is the end-to-end time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1085", "title": "Ring-AllReduce Communication Overhead Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Assuming a standard Ring-AllReduce implementation and no compute overlap, what is the exact network communication time per step?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 2}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1087", "title": "Gradient Clipping vs Data Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": 
"fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many poisoned samples appear per global batch, and how does per-sample clipping at C=1.0 change their relative gradient contribution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1088", "title": "Diagnosing DDP Network Bottlenecks", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Under hierarchical AllReduce, why does utilization drop from 98% to 42% across 4 nodes, and what software-only fix improves scaling?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 3}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1089", "title": "Evaluating Defenses for LLM Continuous Pre-training Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which backdoor defense should you choose considering security guarantees and compute economics?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1090", "title": "Diagnosing Targeted Data Poisoning in LLM Fine-Tuning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you determine whether the refund-intent drop is targeted data poisoning or concept drift, and isolate the corrupted training records?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1091", "title": "S3 Small File GPU Starvation Diagnosis", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is bottlenecking the S3 dataloader, and how should the image dataset be laid out to restore GPU utilization?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 1}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1092", "title": "Optimizing S3 Throughput with Data Sharding", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What single-thread throughput do 100KB S3 reads achieve versus 500MB shards, and how does sharding fix the bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1093", "title": "Calculating Data Stall Ratio for Vision Transformers", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"training", "question": "Assuming perfect asynchronous data prefetching where computation and I/O overlap entirely, what is the Data Stall Ratio for the accelerators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1094", "title": "Evaluating Storage Sharding vs Caching for Multimodal Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you deploy the 125TB Redis cache or convert the 500M samples into 1GB WebDataset shards, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1095", "title": "Mitigating Data Stalls in Distributed Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade the central file system to 300 GB/s or deploy local NVMe caches, and what is the quantitative justification?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1096", "title": "Lustre Parallel File Striping Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this severe data loading bottleneck, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1097", "title": "Checkpoint Loading via Data Striping", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum stripe count is needed to load the 2.5TB checkpoint in 20 seconds from 200Gbps storage nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1098", "title": "Optimizing Stripe Size for 4 TB/s AI Storage", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What stripe size and number of storage targets would you choose for 2 GB TFRecord files to hit 4 TB/s without excess metadata overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1099", "title": "Diagnosing ViT Data Pipeline Stalls", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the data stall ratio, and is the 30 ms gap caused by network I/O or CPU preprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1100", "title": "AOT Dataset 
Compilation for H100 GPU Clusters", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the network and CPU requirements to determine if you should use JIT decoding or AOT dataset compilation to feed the 32,000 images/sec.", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1101", "title": "Evaluating WS vs OS Dataflows for LLM Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which dataflow should the 256x256 array use for the 8192x8192 layer in prefill versus batch-1 decoding, and what HBM traffic drives the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1102", "title": "Diagnosing GEMM Memory Bandwidth Bottlenecks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause and optimize the dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1103", "title": "Diagnosing Data Loader Bottlenecks in Vision Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze and resolve this data pipeline bottleneck using a dataset compilation approach?", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1104", "title": "Weight-Stationary Tiling DRAM Bandwidth", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GB of DRAM reads are required for activation matrix A under the TPU's 32MB weight-stationary tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1105", "title": "Evaluating Data Compilation Strategies", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to 192-core CPU nodes or use offline dataset compilation for the 64-GPU image pipeline, and why?", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1106", "title": "Demographic Data Scaling Shortfall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If you rely on naive random data collection from a pipeline where Demographic A has a 1% true occurrence rate, how many raw images must you ingest to close the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1107", "title": "Diagnosing Regional Performance Degradation via Datasheets", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you use Datasheets for Datasets to perform a root-cause analysis of this failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1109", "title": "Diagnosing PFC Storms in DCQCN", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze why DCQCN is failing to prevent PFC triggering during incast microbursts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1110", "title": "Trade-offs of DCQCN Parameters in RoCEv2 AI Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you focus on aggressive DCQCN tuning or switch buffer/PFC threshold tuning for the MoE incast congestion, and why?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 2}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1111", "title": "DLQ Storage and Reprocessing Provisioning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What DLQ storage is required per 24-hour period, and what repair-job throughput is needed to drain a 24-hour backlog in a 4-hour window?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 0}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1112", "title": "Diagnosing Poison Pill Bottlenecks in Streaming Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you root-cause this blockage and re-architect the pipeline to restore throughput without losing the corrupted 
transaction records?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 1}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1113", "title": "Evaluating DLQ Architectures for High-Throughput Streams", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which DLQ strategy—synchronous Postgres, asynchronous Kafka, or stdout logging—meets the 5-second freshness SLA, and why?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 2}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1114", "title": "Declarative Autoscaling Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many L4 GPU replicas will KServe provision for 2,500 RPS at 60 ms latency with concurrency 16 and 75% target utilization?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 0}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1115", "title": "Evaluating Declarative API Sidecar Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does declarative scale-to-zero justify the per-pod sidecar overhead for 500 low-traffic models, and what trade-off gates adoption?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 2}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1116", "title": "DCQCN Incast Buffer Sizing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What peak switch buffer occupancy occurs before DCQCN takes effect, and what minimum buffer size avoids PFC pauses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1117", "title": "Silent Saturation in Declarative Autoscaling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the CPU-based declarative autoscaler fail at 600 QPS, and what metric should drive scaling instead?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 1}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1118", "title": "Control Loop Reconciliations for Gang Scheduling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many reconciliation cycles will observe drift before the 3 autoscaled nodes are ready after 240 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1119", "title": "Control Loop Thrashing in Gang Scheduling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What flaw causes the 32-GPU job's control-plane thrashing, what is the impact, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1120", "title": "Evaluating Declarative Scheduling for GPU Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What MTTR improvement does declarative scheduling provide over imperative scripts under 15% hourly churn, and what control-plane risk remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1122", "title": "Evaluating Demographic Parity Trade-offs in Cloud Resume Screening", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happens to Precision and Equal Opportunity if you enforce strict Demographic Parity despite Group M and Group N having 25% vs 10% base rates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1123", "title": "Multi-Core Power Trade-offs Post-Dennard Scaling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 300W CPU option provides better theoretical throughput for parallel ML workloads like embedding lookups, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1124", "title": "Memory Scaling of Widened Dense Layers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much additional FP32 GPU memory is needed when the dense layer grows from 8192x8192 to 32768x32768 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1126", "title": "Diagnosing Thermal Throttling in Multi-Core CPU Migrations", "topic": "mlops-lifecycle", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 64-core 2.4 GHz instance thermal throttle despite lower per-core frequency, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1127", "title": "Power Limits in Post-Dennard Era", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the expected total power consumption and theoretical throughput of the 8-PE 1.0 GHz design relative to the 100W 2.0 GHz baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1128", "title": "Evaluating Dense Scaling Bottlenecks in Recommendation Systems", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace the dense CTR MLP layers with rank-256 low-rank factorizations, and what are the bandwidth and capacity trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1129", "title": "Diagnosing Low Tensor Core Utilization in Cloud MLPs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is this happening and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1130", "title": "Versioned Embedding Cache Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What peak Redis memory is required to dual-serve 100M V1 768-d and V2 1024-d FP16 embeddings with 20% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1131", "title": "Transformer FFN FLOPs Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs does the [2048,8192] x [8192,32768] FFN forward pass require, and what is its FP16 lower-bound latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1132", "title": "Evaluating GPU Upgrades vs Software Batching for MLPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Will upgrading from A100 to H100 deliver a >3x latency reduction for batch-1 FP16 inference, or should you implement continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1133", "title": 
"Diagnosing Downstream Degradation in Cascaded Models", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the silent 12% CTR drop after the upstream BERT embedding update, and how would a dependency-aware registry prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1134", "title": "Intra-Rack 400G Cabling Power and Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 32 intra-rack 400G links under 2.5 m, should you use passive DAC or AOC, and what cost and power differences drive the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1135", "title": "Multi-Stage Embedding Model Migration", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you safely migrate from the 2048-d v1 embedding model to the 768-d v2 model while managing the temporary GPU cost surge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1136", "title": "400G DAC Link Degradation in Dense Racks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely physical root cause, and how do you systematically diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1137", "title": "Diagnosing Hidden Disparities in API Performance", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze the inference logs to diagnose the root cause of these targeted failures despite healthy aggregate metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1138", "title": "Intra-Rack 400G Interconnect Trade-offs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 2m intra-rack 400G links, would you use passive DACs or AOCs, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1139", "title": "Subgroup Accuracy Disparities", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minority-group accuracy and error-rate disparity does disaggregated evaluation reveal, and how should that affect deployment?", "chain_ids": 
["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 0}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1140", "title": "eKYC Disaggregated Evaluation Strategy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the 100,000-user random holdout insufficient to validate the 2% subgroup, and what sample size is mathematically required to bound their FRR variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1141", "title": "Evaluating Disaggregated Serving for LLMs", "topic": "compound-ai-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much TTFT does transferring the 8K-prompt KV cache add over 200 Gbps, and is disaggregated serving viable for a strict 500ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1142", "title": "KV Cache Transfer in Disaggregated Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What end-to-end TTFT does the user see after prefill, 2GB KV transfer over 200 Gbps, and first-token decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1143", "title": "Calculating Disparate Impact in Cloud Loan APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Disparate Impact Ratio for Group B versus Group A, and does it violate the standard four-fifths (80%) fairness rule?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 0}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1145", "title": "Evaluating Disparate Impact in Cloud-Based Credit Scoring", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use threshold post-processing or adversarial debiasing to raise DIR to 0.80 under the 50ms P99 SLA?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 2}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1146", "title": "Calculating Dispatch Tax in Eager Mode", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What is the actual forward-pass latency and effective GPU utilization once PyTorch dispatch overhead is included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1147", "title": "Diagnosing Dispatch Overhead in Narrow Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the 16% GPU utilization, and what optimization should you use instead of rewriting kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1148", "title": "Diagnosing Bottlenecks in Disaggregated LLM Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the decode-pool TPOT degradation and OOMs for 8192-token contexts over 100 Gbps Ethernet?", "chain_ids": ["cloud-chain-auto-006-02"], "chain_positions": {"cloud-chain-auto-006-02": 1}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1149", "title": "CPU Dispatch Overhead in Eager Execution", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What per-operation dispatch tax does the profile imply, and what native PyTorch optimization would eliminate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1150", "title": "Mitigating Dispatch Overhead in Low-Latency Models", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you upgrade to H100s, use TorchInductor fusion, or wrap the TTS step in CUDA Graphs to reduce P99 latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1151", "title": "Evaluating Framework Execution Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you build a custom monolithic CUDA kernel or use a JIT graph compiler to meet the SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1152", "title": "Diagnosing Low GPU Utilization in Eager Mode GNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU utilization so low, and how would you resolve this?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1153", "title": "Centralized vs. Distributed Checkpointing Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long would centralized checkpointing take versus distributed checkpointing for the 2.1TB FSDP state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1154", "title": "Scaling Checkpoints for ZeRO-3", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you checkpoint the 2.1TB ZeRO-3 state to minimize pause time and avoid object-storage prefix throttling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1155", "title": "Calculating the Financial Impact of Feature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the daily financial impact and extra manual-review volume from the FPR rising from 2% to 8% at 500 TPS?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 0}, "chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1156", "title": "Diagnosing Rank-0 Bottlenecks in Massive Model Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is rank-0 checkpointing taking over 20 minutes and timing out, and what checkpointing architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1157", "title": "Diagnosing Silent Model Degradation in E-Commerce Recommendations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this degradation to pinpoint the root cause of the silent failure?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 1}, "chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1158", "title": "Mitigating Transient Covariate Shift in Recommendations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 24-hour promo covariate shift, should you emergency-retrain on 4 hours of data or toggle the trending-items fallback?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 2}, 
"chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1159", "title": "Evaluating Overparameterization and Double Descent in Vision Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Under what conditions is scaling to 150M parameters theoretically justified, and how do you reconcile it with the 50ms inference latency budget?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 2}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1160", "title": "Diagnosing the Interpolation Threshold Error Spike", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused validation error to spike at 1.2M parameters, and should the team revert and add dropout?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 1}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1161", "title": "Scaling Past the Interpolation Threshold", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What target parameter count reaches 10x the interpolation threshold, and does it fit the 2 GFLOP inference budget?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 0}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1162", "title": "Root-causing Deep MLP Training Collapse", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this stalled training, and how would you analyze the contribution of the learning rate schedule and initialization to this failure state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1163", "title": "Wasted Activation VRAM from Dying ReLUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much VRAM (in MB) is wasted storing zero activations for the backward pass across all 4 ReLU layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1164", "title": "Evaluating Activation Trade-offs for Dead Neurons", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", 
"zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace ReLU with GeLU or SwiGLU despite extra activation FLOPs, and what is the hardware trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1165", "title": "Dynamic Batching Delay Calculation", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum max_batch_size guarantees the 50ms SLA when max_batch_delay must be 10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1166", "title": "Debugging Dynamic Batching Latency Spikes", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 hit 130ms during bursts, and how should max_queue_delay be reconfigured?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 2}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1167", "title": "Rolling Window Dynamic Benchmarking Cost", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum rolling window N you can afford under the $1,500 monthly dynamic benchmarking budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1168", "title": "Evaluating Dynamic Benchmarks for Code Generation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the daily dynamic benchmark, and why use hybrid LLM-as-judge plus 5% HITL instead of pure HITL?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1170", "title": "Dynamic Batching for Strict Latency SLOs", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What max_batch_size and max_queue_delay would you choose to keep serving under the 70ms server-side budget?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 3}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1171", "title": "Dynamic Batching Timeout Trade-offs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does max_batch_size 16 with a 60ms batch_timeout 
satisfy the 100ms P99 SLA, and what timeout would be safe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1172", "title": "GPU Voltage Scaling Dynamic Power Reduction", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What percentage reduction in dynamic power results from lowering voltage by 15% and frequency by 10%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1173", "title": "Dynamic Batching Wait Window Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What max dynamic batching wait window and batch size keep the translation API within the 200ms P99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1174", "title": "Evaluating DVFS for Cluster Power Capping", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use frequency-only throttling or DVFS to stay under the 20MW power limit while maximizing aggregate throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1175", "title": "Diagnosing Non-Linear Power Drops in GPU DVFS", "topic": "mlops-lifecycle", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did a 25% SM clock reduction cut dynamic power by nearly 58% instead of the projected 25%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1176", "title": "DVFS Power Cap for High-Density GPU Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What percentage reduction in clock frequency is needed to cap each GPU at 300W, and how much does theoretical throughput drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1177", "title": "Diagnosing P99 Latency Spikes in Dynamic Inference", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency 250ms with low GPU utilization, and what preprocessing and feature-fetch changes would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1179", "title": "Eager Execution Dispatch Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"training", "question": "What are the CPU dispatch time, GPU compute time, maximum GPU utilization, and best fix for the 50,000 tiny matmuls per forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1180", "title": "Diagnosing Low GPU Utilization in Dynamic Dispatch", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the GPU starved, and how do you analyze and resolve this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1181", "title": "Eager vs Static Execution in Dynamic GNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate eager versus static execution for this dynamic GNN, and what production deployment strategy would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1182", "title": "Evaluating DVFS Power Capping for LLM Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which power-capping strategy should the 35kW H100 rack use, uniform DVFS downclocking or selectively idling GPUs, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1183", "title": "Diagnosing ECMP Hash Collisions in RoCEv2 Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root cause explains the spine uplink imbalance, and what telemetry would confirm ECMP hash collisions?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 1}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1184", "title": "Evaluating ECMP for RoCEv2 GPU Training Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What risks does pure ECMP pose for 4,096-GPU synchronous LLM training, and what network architecture would you choose instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1185", "title": "Single-Flow ECMP Hashing Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the max uplink utilization, what routing behavior causes the 100Gbps cap, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1186", "title": "Calculating the Efficiency Frontier Trade-off", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum batch size and throughput does each model achieve under the 100ms SLA, and what throughput-mAP trade-off results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1187", "title": "LLM Concurrency and KV Cache Thrashing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did raising max concurrency from 64 to 256 spike p99 TTFT and reduce throughput on the 8xH100 70B service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1188", "title": "Evaluating the LLM Efficiency Frontier", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you downsize to an 8B FP16 model or run a W4A16 70B model on 2 A100s to cut cost by 50%, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1189", "title": "Spot Instance Preemption and Batch Scaling", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After preemption to 48 GPUs, what are the new global batch size, learning rate, and steps per 1,228,800-image epoch?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 1}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1190", "title": "Diagnosing TorchElastic Spot Preemption Stalls", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes the 15-minute TorchElastic stall after a node preemption, and how would you reduce it?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 2}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1191", "title": "Evaluating Preemption Overheads in Elastic LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Assuming a 15-minute checkpoint interval for Framework A, which framework maximizes goodput over a 14-day run, and why?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 3}, "chain_tiers": 
{"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1192", "title": "Elephant Flow Collisions with ECMP", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why are some RoCEv2 ports saturated during the 130GB AllReduce while others idle, and how would you fix the routing?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 0}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1193", "title": "Diagnosing ECMP Hash Collisions", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1194", "title": "Evaluating ECMP vs Adaptive Routing for Elephant Flows", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you mitigate AllReduce ECMP collisions with switch Adaptive Routing or a Rail-Optimized topology, and why?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 2}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1195", "title": "Resource Exhaustion in ELT Feature Generation", "topic": "compound-ai-systems", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the resource exhaustion, and how should the transformation pipeline be restructured to handle this scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1198", "title": "DLRM Embedding Sharding Strategy", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you shard the 200GB user table and four 5GB tables across 8x80GB A100s to maximize DLRM throughput without OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1199", "title": "Carbon Footprint Calculation for GPU Clusters", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 3-year lifecycle carbon footprint for the 1,024-GPU cluster, and what percentage is embodied carbon?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1200", "title": "DLRM Embedding Table Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does the 1B-by-128 FP32 Adam embedding table require, and how many 80GB GPUs are minimally needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1201", "title": "Diagnosing DLRM Load Imbalance", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this bottleneck and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1202", "title": "Analyzing Lifecycle Carbon in GPU Refresh Cycles", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the fundamental flaw in calling the renewable-powered cluster zero-carbon while refreshing GPUs every 1.5 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1203", "title": "Lifecycle Carbon Analysis of Hardware Refresh", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you evaluate the true carbon impact of this hardware refresh over a 3-year depreciation cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1204", "title": "GPU Energy Efficiency for Embedding Extraction", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much daily energy does Setup A versus Setup B use to process 100M documents, and which is more energy-efficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1205", "title": "Non-Linear Power Scaling in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this non-linear scaling of power vs. 
throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1206", "title": "Energy-Movement Invariant in Feature Pipelines", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which approach is more energy-efficient, and what is the approximate energy gap per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1207", "title": "Evaluating GPU Energy Metrics under Diurnal Load", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the fundamental flaws in this methodology before approving a fleet-wide rollout for a service with highly diurnal traffic (peak 10k QPS, trough 1k QPS). What is flawed about comparing INT8 and FP16 by 60s nvidia-smi power alone, and what energy metric should guide rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1208", "title": "Evaluating Edge vs Centralized Processing for Telemetry Data", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you centralize 5 PB in us-east-1 for H100 training or use federated regional training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1209", "title": "LLM Inference Energy Bottleneck", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 70B batch-1 inference token, how much energy goes to HBM reads versus FP16 compute, and which dominates?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1210", "title": "Diagnosing Cross-Region Training Energy Spikes", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the true energy bottleneck in the 50TB/day CTR pipeline, and why won't upgrading to H100s with FP8 fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1211", "title": "Evaluating Kernel Fusion Energy Trade-offs", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": 
"both", "question": "Should you accept this proposal, and how does the 20% FLOP penalty impact the overall energy consumption per token?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1212", "title": "Root-Causing Memory Power in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 20B batch-1 decoding service draw 180W with only ~0.1W of ALU work, and how would you reduce it?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1213", "title": "Diagnosing High TDP in Low-Util Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do A100s hit 400W TDP at batch size 1 despite only 20% tensor-core utilization, and what mitigations reduce the draw?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1214", "title": "LLM Inference Energy Movement Bottleneck", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will a 15x INT8 MAC energy reduction meaningfully cut fleet power for batch-1 70B decoding with the same HBM, and why?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1215", "title": "Calculating Arithmetic Power for LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum active compute power required strictly for the arithmetic operations, ignoring memory access and static system leakage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1217", "title": "Resume Screening True Positive Rate Trade-offs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach should you use to achieve Equality of Opportunity for the resume API under the 100ms SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1218", "title": "Resume Screening API Equality of Opportunity Threshold Tuning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the 
current True Positive Rates for Groups A and B, and how many additional true positives does Group B need for Equality of Opportunity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1221", "title": "Erasure Coding Storage Footprint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much raw storage does RS(10,4) require for 10 PB of data, and how many simultaneous drive failures per stripe can it tolerate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1222", "title": "Exabyte-Scale LLM Data Erasure Coding Migration", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you migrate the 200 PB active training dataset from 3-way replication to RS(10,4), and what throughput trade-offs drive the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1223", "title": "Cluster MTBF from ECC Uncorrectable Errors", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the cluster MTBF from uncorrectable HBM errors, and is a 4-hour checkpoint interval acceptable?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 1}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1224", "title": "Diagnosing High Correctable ECC Error Rates", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can 10,000 correctable ECC errors/sec cause a 20% step-time slowdown, and how should it be addressed?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 2}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1225", "title": "Object Storage Tail Latency Spikes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What storage-level mechanism causes these rare but severe tail latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1226", "title": "Randomized Thresholds for Equalized Odds", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What randomization 
probabilities over the two ROC operating points achieve Equalized Odds at exactly FPR=0.15 and TPR=0.75 for Groups A and B?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1227", "title": "Applying Error Feedback in Top-k Gradient Sparsification", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With error feedback, what is the next compensated gradient, will it be transmitted, and what FP16 residual memory overhead is required per GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1228", "title": "Evaluating HBM3 ECC overhead at 24k GPU scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should ECC be enabled or disabled on the 24,576-GPU H100 cluster, and how does the cluster MTBF affect the throughput trade-off?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 3}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1229", "title": "Diagnosing Top-k Gradient Divergence", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this divergence, and how do you fix it while maintaining the 99% compression ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1230", "title": "Evaluating Error Feedback Memory Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does error feedback cause OOM for the 20B model, and how would you fix it while preserving the communication reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1231", "title": "Analyzing Silent Data Corruption Propagation in 3D Parallelism", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you trace a single-GPU FP16 silent data corruption through TP=8, PP=8, DP=32 to isolate the failing H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1232", "title": "SDC Propagation vs Checksum Trade-off", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 500B job, would you checksum every gradient sync or only checkpoint states, and why?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1233", "title": "AllReduce Fault Propagation Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What total wall-clock time is needed to achieve 1,000 hours of effective training progress under this fault model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1234", "title": "Energy Savings with Event-Driven Activation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What daily energy and dollar savings does the event-driven camera pipeline achieve after accounting for the always-on filter?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 0}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1235", "title": "Evaluating Event-Driven Activation for Cloud Video Analytics", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy vs. latency trade-offs of this design: under what hardware and data conditions does the overhead of the event-generation logic negate the overall power savings?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 2}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1236", "title": "Diagnosing GPU Power in Event-Driven SNNs", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 98%-sparse SNN still run GPUs at 100% utilization, and what execution change is needed for energy savings?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 1}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1237", "title": "Model Weight Exfiltration Timing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How large is the FP16 7B model theft payload, how long does exfiltration at 100 Mbps take, and does it evade the 10 GB/10-min alarm?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1238", "title": "Diagnosing Model Exfiltration via HostPath", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "What caused the 10 Gbps, 2-minute egress spike, and what vulnerability allowed the 70B model weights to be stolen?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1239", "title": "Evaluating Confidential Computing for Exact Model Theft", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture best prevents exact extraction of the 140GB model by an insider, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1240", "title": "Diagnosing H2D Transfer Serialization on A100 GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is H2D transfer serialized with compute despite using pinned memory and `non_blocking=True`, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1241", "title": "Overlapping Transfer and Compute via Streams", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Assuming an effective Host-to-Device (H2D) transfer bandwidth of 32MB/ms, what total latency is required for 1,000 requests in a naive synchronous pipeline versus a fully pipelined pinned-memory CUDA stream design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1242", "title": "Evaluating Stream Overlap for H2D Transfers", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What speedup does overlapping the 10ms H2D transfer with 40ms compute provide, and what are the associated memory costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1243", "title": "Calculating Expected Calibration Error", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Expected Calibration Error for the two confidence bins over the 10,000-sample test set?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 0}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1244", "title": "Diagnosing Overconfident Predictions in Cloud Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the 94%-accurate classifier failing confidence-based routing, 
and what low-latency post-hoc calibration fix would you apply?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 1}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1245", "title": "Evaluating Calibration Methods for High-Stakes Cloud APIs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which calibration method should you choose to reduce ECE below 0.05 within the 2ms SLA, and how does ECE guide the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1246", "title": "KV Cache External Fragmentation Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many new requests needing 150MB contiguous KV cache blocks can be admitted, and how much free memory is unusable fragmentation?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 1}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1247", "title": "Diagnosing KV Cache External Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose this discrepancy between reported free memory and the OOM failures, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 2}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1248", "title": "Evaluating KV Cache Allocators for Variable Sequences", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use compacting garbage collection or PagedAttention-style allocation to fix KV cache fragmentation, and why?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 3}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1249", "title": "Calculating Optimal Checkpoints for Fail-Stop Errors", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using Young's formula ($T_{opt} = \\sqrt{2 \\times t_{save} \\times MTBF}$), what is the optimal checkpoint interval and daily checkpointing overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1250", "title": "Diagnosing NCCL Timeout in Fail-Stop Node Crashes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the fail-stop node crash take 30 minutes to surface in NCCL, and how would you configure the fabric to detect it faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1252", "title": "Rack-Aware Capacity Provisioning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total nodes are needed to survive one rack failure when spreading the 6-node baseline across 3 racks versus 4 racks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1253", "title": "Evaluating Topologies for Cross-Rack Redundancy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you uniformly spread active replicas across all 10 racks, or strictly isolate primary/replica clusters to specific PDU boundaries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1254", "title": "Correlated Rack-Level Training Failure", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the 128-node training job fail despite an 8-node redundancy buffer, and what topology flaw caused it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1255", "title": "Estimating Cluster Failure Frequency", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many node failures should the fault-tolerance system expect during the 30-day run?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 2}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1256", "title": "Evaluating Optimal Checkpoint Frequency at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What checkpoint interval maximizes training goodput for the 16,384-GPU cluster given 40,000-hour node MTBF and 5-minute saves?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 3}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-1257", "title": "Large-Cluster MTBF and Goodput Analysis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the frequent interruptions and the core bottleneck affecting goodput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1258", "title": "Calculating Historical Fair-Share Priority", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the dynamic priority scores for Teams Vision and NLP, and which team receives the next 32 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1259", "title": "Fair-Share Scheduling Trade-offs for Bursty Workloads", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you reduce the decay half-life to 24 hours or add hierarchical quotas with node reservation, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1260", "title": "Diagnosing MMD Fairness Bottlenecks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 150ms to 1200ms slowdown, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1261", "title": "Starved Urgent Jobs in Fair-Share Clusters", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Team X's 16-GPU debug job starve, and what scheduler change would support urgent debugging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1262", "title": "MMD Penalty Overhead in Fairness Training", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much FP32 memory and how many FLOPs does the 8192x8192 MMD distance matrix add per batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1263", "title": "Resolving Conflicting Fairness Metrics in Credit Scoring APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate these constraints and architect a viable deployment strategy?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1264", "title": "Evaluating Adversarial Debiasing Trade-offs in Large-Scale Credit Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you keep adversarial debiasing, or switch to a lighter fairness intervention to hit DI ≥0.80 within 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1267", "title": "Sizing a Non-Blocking Fat-Tree Network", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 64-port switches are needed for a 1:1 non-blocking 2-tier fat-tree for 2,048 GPUs, split between leaf and spine?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 0}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1268", "title": "Diagnosing Bisection Bandwidth Drops in a GPU Cluster", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What routing-level architectural issue in the fat-tree network is causing this specific bottleneck?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 1}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1269", "title": "Non-blocking vs Oversubscribed Fat-Tree", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose a 1:1 or 2:1 oversubscribed fat-tree for 65,536-GPU MoE training, and why?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 2}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1270", "title": "Chaos Testing Distributed Training Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many faults should you inject, and what is the overall cluster availability percentage during this 100-hour test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1271", "title": "Diagnosing Deadlocks During Simulated GPU Fault Injection", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 1024-GPU job hang after the injected Xid 48 failure, and 
what NCCL logs and settings are needed to test recovery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1272", "title": "Continuous Fault Injection in Synchronous Distributed Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you inject random node failures every 2 hours in the 4,096-GPU production run, or test in staging, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1273", "title": "Cluster Hard Fault MTBF Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected cluster MTBF from hard HBM and transceiver faults for the 10,000-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1274", "title": "Diagnosing Co-tenant Voltage Fault Injections", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What attack explains the co-tenant-correlated misclassifications, and how does it bypass enclave memory isolation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1275", "title": "Evaluating Cloud Enclave Fault Injection Mitigations", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which mitigation should you choose against Plundervolt-style key leakage, and what are the performance, security, and TCO trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1276", "title": "Evaluating Fault Models for Large-Scale GPU Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 100 daily hardware errors all trigger fail-stop restarts, or should you use a multi-tier fault model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1277", "title": "Diagnosing Network Saturation in Sharded Embedding Lookups", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the frontend 100 Gbps NIC saturating, and how should DLRM batching change across the 8 embedding servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1278", "title": "Cloud FPGA Remote Voltage Fault 
Injection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many API requests and dollars would the attacker need to extract the 4,000x1,000 dense layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1279", "title": "Evaluating Feature-Parallel Batching for DLRM", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch to feature-parallel batching or upgrade to 800 Gbps NICs for the 50,000 QPS DLRM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1280", "title": "Diagnosing Limp-Ware Fault Models in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What fault model explains the >40% throughput drops, and how can you turn the degraded PCIe link into an orchestrator-handled failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1281", "title": "Optimizing DLRM with Feature-Parallel Batching", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much CUDA launch overhead is saved by switching from request-parallel to feature-parallel batching for 512 requests and 80 features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1282", "title": "Calculating GPU Feeding Tax for Distributed Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the feeding tax when 8 GPUs demand 1.2 GB/s of images but the NAS supplies only 800 MB/s?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 0}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1283", "title": "Diagnosing GPU Starvation in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the GPUs averaging only 40% utilization despite DataLoader tuning, and what I/O change is needed?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 1}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1284", "title": "Calculating HBM Traffic in Flash Attention", 
"topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic per attention head is saved by FlashAttention for N=8192, d=128 FP16 attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1285", "title": "Evaluating Storage Architecture to Eliminate CV Training Feeding Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Would you upgrade to 100 Gbps with WekaFS or add local NVMe caching with WebDataset, and why?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 2}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1286", "title": "Diagnosing SRAM Spills in Tiled Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do the Br=256 and Bc=256 attention tiles spill to HBM on A100, and what tile sizing fixes the bottleneck?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 1}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1287", "title": "Evaluating Flash Attention Arithmetic Intensity", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is standard 32K attention compute-bound on A100, or does Flash Attention improve wall-clock time by raising arithmetic intensity?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 2}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1288", "title": "HBM Bandwidth Savings with FlashAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM traffic do S and P cause for one FP16 head at N=8192, and how much does FlashAttention reduce it?", "chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 0}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1289", "title": "Diagnosing FlashAttention Recomputation Optimization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would caching the NxN attention probabilities to save backward FLOPs degrade throughput despite 20GB free HBM?", 
"chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 1}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1290", "title": "Evaluating FlashAttention vs Standard Attention for 32K Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use activation checkpointing with standard attention or adopt FlashAttention for 32K context, and why?", "chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 2}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1291", "title": "Latency Overhead of RS-FEC in PAM4 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total one-way end-to-end latency and the percentage of this latency introduced solely by FEC.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1292", "title": "Jacobian Computation for Low-Dimensional Inputs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which automatic differentiation mode should be used to compute the Jacobian, and how many passes will it require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1293", "title": "Diagnosing 400G PAM4 Latency Floors", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you disable FEC to lower 400G RoCEv2 hop latency, and what causes the ~1.2μs latency floor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1294", "title": "Diagnosing Jacobian Bottlenecks in Many-Output Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do reverse-mode VJPs OOM for a 65,536x25 Jacobian, and which autodiff strategy should replace them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1295", "title": "Evaluating Forward-Mode AD for Jacobian Regularization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 500x5 PINN Jacobian be computed with forward-mode AD instead of reverse-mode, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1296", "title": "Distributed LLM Training Deadlock and Gang Scheduling", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many GPU-hours are wasted before NCCL times out, and what scheduling paradigm prevents this partial allocation?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 0}, "chain_tiers": {"cloud-chain-auto-021-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1297", "title": "Evaluating FEC Trade-offs in 800G PAM4 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should FEC be disabled on 2-meter 800G PAM4 DAC links to save 100–150 ns per hop, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1298", "title": "Diagnosing PyTorch DDP Partial Allocation Deadlocks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this resource fragmentation and resolve the cluster-wide deadlock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1299", "title": "Evaluating Gang Scheduling for LLM Training", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use an all-or-nothing gang scheduler or app-level timeouts for these 16-node FSDP jobs, and what is the compute waste impact?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 2}, "chain_tiers": {"cloud-chain-auto-021-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1300", "title": "Diagnosing CPU Inefficiency in Transformer Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 112 CPU cores at 100% utilization achieving under 5% peak TFLOPs for batch-size=1 Transformer inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1301", "title": "Evaluating Accelerator Efficiency Overheads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 10 TFLOP/500W AVX-512 CPU fleet far less power-efficient than the 200 TFLOP/400W accelerator for dense inference?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1302", "title": "Go-Back-N Congestion Penalty During AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With Go-Back-N and 2,000 unacknowledged 4KB packets, how much data is retransmitted after the single dropped packet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1303", "title": "Quantifying the CPU Generality Tax in Batch Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the pJ/FLOP for the Xeon and T4, and what annual power cost savings come from moving the 10 PFLOP/s workload to T4s at $0.10/kWh and 1.2 PUE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1304", "title": "Diagnosing RoCEv2 Throughput Collapse Under Minor Packet Loss", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would such a microscopic packet drop rate cause an 80% degradation in goodput, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1305", "title": "Go-Back-N Penalty in 800G RoCEv2 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you accept the vendor's Go-Back-N RoCE NICs for 2GB AllReduce at 800Gbps with 0.1% packet loss, and what bandwidth penalty results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1306", "title": "GPU DALI Preprocessing Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If JPEG decoding moves to the GPU with DALI, what are the new per-image latency and PCIe transfer volume savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1307", "title": "CPU-Bound Image Preprocessing Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is bottlenecking the A100 image classification service at 400 RPS, and how should the preprocessing pipeline be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1308", "title": "Evaluate GPU Preprocessing for Video Analytics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": 
"evaluate", "status": "published", "phase": "training", "question": "Should the video pipeline send 200KB compressed frames to the GPUs and decode with DALI instead of transferring 6.2MB raw frames, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1309", "title": "SLA Budgets Under Network Stress", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "During the outage, what is the heuristic model's maximum compute budget and how many CPU instances are needed for 10,000 RPS?", "chain_ids": ["cloud-chain-auto-secondary-015-19"], "chain_positions": {"cloud-chain-auto-secondary-015-19": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1311", "title": "Scaling Effective Batch Size via Accumulation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many gradient accumulation steps are required to reach a global batch size of 1024, and how should the loss be scaled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1312", "title": "Diagnosing DDP Overhead in Gradient Accumulation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is DDP utilization low during 32 gradient accumulation steps, and how do you fix the communication overhead?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 1}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1313", "title": "Evaluating Extreme Gradient Accumulation Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the compute, memory, and communication trade-offs of using 8-step gradient accumulation for the 3B LLM on 8x A100s?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 2}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1314", "title": "Degrading DLRM Ranking Under Capacity Loss", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you gracefully degrade the recommendation API after losing 60% of T4 capacity while keeping P99 under 200ms?", "chain_ids": ["cloud-chain-auto-secondary-015-19"], "chain_positions": {"cloud-chain-auto-secondary-015-19": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-19": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1315", "title": "Gradient Sparsification Compute Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did Top-1% gradient sparsification make the 10B model's step time over 2 seconds slower despite reducing payload to 200MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1316", "title": "Mitigating Gradient Inversion in Medical Federated Learning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do batch-size-1 federated gradients allow exact X-ray reconstruction, and what mitigation prevents it with under 1% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1317", "title": "Gradient Quantization vs Sparsification", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 10B model on 64 GPUs, should you deploy Top-1% sparsification or INT8 gradient quantization, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1318", "title": "Analytical Gradient Inversion Complexity", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For batch size 1, how many FLOPs are needed to reconstruct the 4,096-dimensional input from dW and db?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1321", "title": "Ring All-Reduce Data Volume Calculation", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much gradient data does each GPU transmit per Ring AllReduce step for the 30B BF16 model on 64 GPUs, and how long does it take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1322", "title": "Local Gradient Clipping Divergence", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does clipping gradients locally before AllReduce make the 13B DDP run diverge, and what is the correct ordering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1323", "title": "Evaluating Sync vs Async Gradient Strategies", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", 
"level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch the 7B LLM training job to Async Parameter Servers or keep Synchronous AllReduce with 8-bit compression, and why?", "chain_ids": ["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 1}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1325", "title": "Diagnosing torch.compile Graph Breaks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did adding a data-dependent if statement to the torch.compile Transformer collapse throughput from 3,500 to 800 tokens/sec, and how should you fix it?", "chain_ids": ["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 1}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1326", "title": "Evaluating MoE Routing Graph Breaks in PyTorch 2.x", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you pad MoE expert capacities to static sizes or retain dynamic routing to achieve predictable sub-30ms latency with torch.compile?", "chain_ids": ["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 2}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1329", "title": "Estimating Graph Fusion Bandwidth Savings", "topic": "graph-compilation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic and latency does compiler fusion save per attention block?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 1}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1330", "title": "Diagnosing Frequent Recompilations in Dynamic Transformers", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze and resolve the root cause of these latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1331", "title": "Calculating Latency Overhead of PyTorch Graph Breaks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 3 graph breaks adding 3ms each, what is the new batch latency and how does it compare to the 20ms eager baseline?", "chain_ids": 
["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 0}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1333", "title": "Diagnosing Data-Dependent Control Flow Failures in JIT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did tracing make the seq2seq model always execute 32 decoding steps, and how does Graph Scripting fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1334", "title": "Evaluating Graph Scripting vs Tracing for Dynamic Control Flow", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Adaptive Compute Transformer use trace-based JIT or graph scripting for data-dependent refinement loops, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1335", "title": "Dynamic Loop Compilation Latency", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the average latency difference between the traced max-length model and a scripted model that preserves early exits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1336", "title": "Graph Tracing Control Flow OOM", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What happens when the traced Transformer compiled at seq_len 128 receives seq_len 1024, and how much VRAM does it require?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 0}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1337", "title": "Evaluating Graph Tracing Failures in Dynamic Routing Models", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did graph tracing remove the MoE safety fallback in production, and what compilation strategy should replace it?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 2}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1338", "title": "Silent Failures in Traced Dynamic Control Flow", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "Why do sequence length 256 requests silently fail and execute rapidly after tracing with a [16, 64] dummy input, and how is it fixed?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 1}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1339", "title": "Group DRO Weight Update Calculation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using exponentiated Group DRO weights with eta_q=1.0, what is the new normalized weight for Group 3?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 0}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1340", "title": "Debugging Group DRO Training Instability", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this instability?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 1}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1341", "title": "Evaluating Group DRO for Rare Medical Subgroups", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is Group DRO feasible with global batch size 256 for a 1% subgroup, and what system changes are needed for stable convergence?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 2}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1343", "title": "Diagnosing GQA Memory Exhaustion", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the model OOM during the forward pass despite the KV cache fitting in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1344", "title": "Diagnosing High Latency in gRPC Tensor Transfers", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does gRPC/Protobuf still take 8ms for the 2MB tensor transfer, and how would you reduce it toward the network bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1348", "title": "Evaluating gRPC Migration for Inter-Service Tensor Transfer", "topic": 
"collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will standard gRPC achieve the SLA, and what architectural adjustments might be required for optimal tensor transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1349", "title": "Diagnosing LLM Latency Spikes with KV Cache Offloading", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing TPOT to jump above 800ms with low GPU utilization and saturated PCIe traffic?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 1}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1351", "title": "Profiling Microsecond Kernel Power Draw", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the true average power consumption of the 800 µs kernel, and how much higher is it than the 400 W NVML estimate?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 0}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1352", "title": "Diagnosing Power Discrepancies in Sub-second Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 100ms nvidia-smi samples report 300W while the PDU shows much higher node power for 15ms inference bursts?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 1}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1353", "title": "Evaluating Discrepancies in GPU Power Measurement Techniques", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which measurement should you trust for microsecond GPU energy, and what should drive power-aware scheduling for the 8-H100 cluster?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 2}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1355", "title": "Triple Modular Redundancy Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the probability that the 3-GPU TMR system produces an incorrect majority output 
in a 30-day window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1357", "title": "Evaluating Dynamic Hardware Precision Scaling", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you rely on the accelerator's dynamic precision unit or use static mixed-precision quantization for 70B LLM serving, and why?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1358", "title": "Tensor Core Vocabulary Padding", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this severe underutilization, and how should the system be redesigned to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1359", "title": "Evaluating GPU Redundancy Trade-offs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use DMR or TMR for the 500-GPU medical inference service, given a 100ms SLA and 40ms inference time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1360", "title": "Tensor Core Dimension Alignment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy Proposal B with 1000/30000 dimensions, and how do Tensor Core alignment constraints affect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1361", "title": "Hardware Trojan Power Side-Channel Detection", "topic": "extreme-quantization", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many traces must you average so the 3-sigma noise bound is no more than the 5 mW Trojan signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1362", "title": "FlashAttention SRAM Tiling Calculation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum symmetric block size B fits Q, K, V, and O in the 128 KB SRAM budget?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1363", "title": "Diagnosing Deterministic Bit-Flips in Custom AI ASICs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", 
"status": "published", "phase": "training", "question": "What is the likely root cause of deterministic logit corruption with rare token IDs and a correlated 50 mW power spike?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1364", "title": "Evaluating Hardware Trojan Mitigations", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which mitigation—logic locking, split manufacturing, or boot-time fingerprinting—best fits a 50,000-GPU synchronous LLM training fleet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1365", "title": "Diagnosing Low SM Utilization in LLM Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the GPU at <5% SM utilization for batch-1 decoding, and why won't a cuBLAS upgrade materially improve tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1367", "title": "Hedged Requests Tail Math", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much extra QPS do P90 hedged requests add, and what is the new probability a request exceeds 65ms given single request P(>65ms) = 0.5% and P(>50ms) = 1%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1368", "title": "Debugging a Hedged Request Retry Storm", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did static 15ms hedging crash the embedding store at 15,000 QPS, and what structural fix prevents the cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1369", "title": "HBM3 vs HBM2e for LLM Serving Infrastructure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the two nodes trade off TTFT, decode throughput, and synchronization for low-latency 70B serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1372", "title": "Evaluating Hedged Request Thresholds for Feature Store", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you determine the optimal delay threshold for dispatching the secondary request, and what is the quantitative impact on the storage backend cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1373", "title": "Edge-Cloud Hierarchical Bandwidth Filtering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much data does each robot upload per day with 12 ten-second safety events per hour, and how much does that save versus continuous streaming?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 0}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1376", "title": "Three-Tier Retail Video Analytics", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you partition compute across the Cortex-M4 cameras, Jetson edge server, and cloud to meet all network constraints while maximizing camera battery life?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1377", "title": "Cascading Filter Failure in Hierarchical Pipelines", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does latency spike to 8 seconds during shift changes while cloud GPU utilization drops below 40%?\n", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 1}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1378", "title": "Debugging GPU Cluster Utilization Deadlock", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are all 512 GPUs allocated but idle, and what scheduler change would prevent this deadlock?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1379", "title": "Evaluating Deadlock Resolution in Fleet Orchestration", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you re-enable strict gang scheduling or use timeout-based preemption to maximize goodput on the 1,024-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1380", "title": "Diagnosing Memory and Latency Blowup in FHE Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FHE CNN take 45 minutes and 120GB per image, and what model change would make it practical?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1381", "title": "End-to-End Latency of HE Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total upload-plus-compute latency for one CKKS-encrypted inference request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1382", "title": "Evaluating FHE for Cloud-Based CNN Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a CKKS FHE architecture meet the 10-second ResNet-50 encrypted X-ray SLA, and what alternative would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1383", "title": "HPCC Rate Adjustment with INT", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the flow's new transmission rate using HPCC to fully utilize the link while draining the queue in exactly one RTT.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1384", "title": "Diagnosing Host DRAM Starvation in Vision DataLoader Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the host DRAM staging strategy failing, and how should the data pipeline be structurally analyzed and fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1385", "title": "Sizing Host DRAM for Decoded Tensor Staging", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much host DRAM is needed for four decoded global batches, and what sustained PCIe bandwidth is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1386", "title": "Evaluating Host DRAM Staging vs GPU DALI Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you quantitatively evaluate whether to scale up Host DRAM/CPUs or bypass Host DRAM to decode/augment directly on GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1387", "title": "Evaluating HPCC vs DCQCN in 400GbE AI Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy HPCC with INT 
or tune DCQCN for the 4096-GPU RoCEv2 cluster, and what are the main trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1388", "title": "Impact of I/O Jitter on Distributed Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What effective I/O wait time per step should you expect when 1024 shard reads gate each synchronous step?", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 0}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1389", "title": "Diagnosing Distributed I/O Jitter in Synchronous Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you quantify synchronous I/O jitter across 256 GPUs, and how would you mitigate it without over-provisioning storage?", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 1}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1390", "title": "Mitigating Storage I/O Jitter in Checkpointing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade the PFS or implement two-stage async checkpointing for the 2.1TB state, and why?\n", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 2}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1391", "title": "Diagnosing the I/O Wall in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage bandwidth does the 8-GPU ViT training loop require, and will a 2GB/s NAS cause an I/O wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1392", "title": "Mitigating the Multimodal I/O Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to a Parallel File System (Lustre) or implement local NVMe WebDataset caching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1393", "title": "Medical 3D CNN I/O Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "What is causing the low 45% GPU utilization on the 8x H100 node, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1394", "title": "Root-Causing HPCC Incast Drops in 400G Networks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is HPCC failing to prevent spine buffer drops during the 256-node All-to-All incast, and how should you tune it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1395", "title": "Debugging Intermittent Duplicate Features in Spot-Backed Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause and resolve this issue without disabling Spot instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1396", "title": "Evaluating Idempotency in LLM Data Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which resilience design should you choose for the 50 TB/day PII redaction pipeline, and why?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1397", "title": "Evaluating im2col Memory Inflation in High-Res CNNs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For this first 3x3 convolution on 2048x2048 FP16 images, should you use im2col+GEMM, Winograd, or implicit GEMM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1398", "title": "Diagnosing Hidden Memory Costs in Convolution", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What transformation causes the OOM in this convolution layer, and how should it be implemented instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1399", "title": "im2col Memory Expansion Overhead", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What im2col intermediate memory footprint does this FP8 3x3 convolution create, and why can it OOM a 16GB T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1400", "title": "Immutable Audit Trail Storage Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you design the audit logging path, and what write throughput and 7-year WORM storage are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1401", "title": "Calculating Storage Bloat from Non-Idempotent Appends", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much 30-day storage bloat and deduplication shuffle does non-idempotent append create, and what write strategy avoids it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1402", "title": "Evaluating High-Throughput Immutable Audit Trails", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which audit architecture meets the <50ms inference SLA and 7-year WORM retention at 50,000 requests per second?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1403", "title": "Imperative Scheduling Topology Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 64-GPU job stay Pending even though 100 GPUs are idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1404", "title": "Resolving Imperative Scheduling Deadlocks on GPU Nodes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the schedulable capacity of each p4d node, and why does the strict pod request remain Pending?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1405", "title": "Parameter Server Incast Buffer Overflow Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What happens at the ToR switch when all 1,024 workers simultaneously send 2MB gradients to one parameter server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1407", "title": "Diagnosing Network Incast in Distributed Training Synchronization", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What network pattern is causing All-Reduce tail latency despite only 30% average link utilization, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1408", "title": "Evaluating Incast Mitigation in Large-Scale GPU Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you rely on PFC and deep buffers or restructure AllReduce communication to mitigate the 31-to-1 incast, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1409", "title": "CNN vs MLP Parameter Scaling", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the first-layer parameter counts of the MLP and CNN compare, and why does the CNN's inductive bias help on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1410", "title": "Analyzing ViT Generalization in Data-Constrained Cloud Environments", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the ViT-Base overfitting the 50,000 X-ray dataset while ResNet-50 generalized better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1411", "title": "Evaluating Imperative vs Declarative Scheduling for Mixed Workloads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose between Slurm-style imperative scheduling and declarative gang scheduling to improve 65% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1412", "title": "Evaluating Inductive Bias in Data-Constrained Satellite Imagery", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the satellite anomaly pipeline use the 2B-parameter ViT or the 50M-parameter CNN given only 200,000 labels, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1413", "title": "Dynamic Batching Latency Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum safe `max_batch_delay` you can set with a batch size of 32 to maximize throughput without violating the 50ms P99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1414", "title": "Debugging End-to-End Visual Search Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P90 API latency spike to 120ms at 150 QPS despite 8ms TensorRT inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1415", "title": "Multi-Stage Inference Pipeline Bottleneck Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum FPS for synchronous versus asynchronous execution, and how many preprocessing threads are required to saturate the T4 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1416", "title": "Calculating Influence Functions for Model Debugging", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is exact inverse Hessian computation feasible for the 50M-parameter ResNet, and what approximation should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1417", "title": "Continuous vs Dynamic Batching Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should this LLM server use request-level dynamic batching or iteration-level continuous batching to meet TTFT and TBT SLOs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1418", "title": "Redesigning an End-to-End Vision Inference Pipeline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can you redesign this pipeline to meet the 100ms SLA, and what are the quantitative trade-offs regarding batching and GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1419", "title": "Evaluating Influence Functions for ViT Attribution", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is iHVP feasible for a 300M-parameter ViT-L/16, and what approximations make influence functions tractable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1420", "title": "Diagnosing Dynamic Batching Latency Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P99 latency hit 450ms with max_batch_size=128 and max_queue_delay=50ms despite only 55% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1421", "title": "Diagnosing Influence Function OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the RoBERTa influence-function job OOM before one training-data evaluation step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1422", "title": "Diagnosing Input Stationary Bottlenecks in MLP Workloads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Input Stationary dataflow bottleneck MLP layers, and how do you quantify the architectural mismatch?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 2}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1423", "title": "Input Stationary Dataflow Memory Math", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many DRAM reads do Input Stationary and Weight Stationary dataflows require for X and W on this 128x128 PE array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1424", "title": "Architectural Trade-offs in Data Integration", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator option better minimizes TCO and tail latency for DLRM inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1425", "title": "Diagnosing HBM-to-SRAM Integration Bottlenecks in Attention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the HBM-SRAM integration bottleneck in standard attention on H100, and what optimization keeps Tensor Cores fed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1426", "title": "Quantifying the Integration Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For batch-1 decoding of the 7B FP16 model, what are the time and energy ratios of weight movement versus compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1428", "title": "Diagnosing Attribution Collapse in Interleaved RecSys", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": 
"L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the discrepancy between the A/B test and the interleaving results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1429", "title": "Mitigating P99 Jitter with Interrupt Shielding", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do NIC interrupts affect CUDA kernel submission latency at 50,000 RPS, and what CPU/IRQ configuration shields inference cores?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 0}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1430", "title": "Diagnosing P99 Inference Jitter from NIC Interrupts", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you root-cause and resolve this latency jitter?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 1}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1431", "title": "Interrupt Shielding vs NIC Coalescing for P99 Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use strict CPU core isolation or NIC interrupt coalescing to stabilize P99 latency under the 20ms SLO, and why?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 2}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1433", "title": "Diagnosing Hidden Bias in Cloud KYC Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do single-axis FRR metrics mask failures for darker-skinned females, and what maximum FRR could that subgroup have?", "chain_ids": ["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 1}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1434", "title": "Evaluating Interleaving vs A/B Testing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you use interleaving or A/B testing for Model B under a strict 150ms P99 SLA, and how would you manage the tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1435", "title": "Invariance Testing Compute Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What daily compute time does the 10-permutation invariance suite add, and what is the application-level violation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1436", "title": "Root-Causing Variance in Resume Screening", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do name swaps and redaction still change the resume model's score, and how would you analyze and fix the pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1437", "title": "Intersectional Bias Mitigation in Identity APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What remediation would you choose for the 12.5% FRR intersectional gap given a 3-month data delay and 280ms MoE latency?", "chain_ids": ["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 2}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1438", "title": "Evaluating Invariance Testing Architectures at Scale", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use dynamic LLM counterfactuals or a static precomputed perturbation dataset for the 100,000-applicant CI/CD tests, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1439", "title": "Calculating IO Overhead in Standard Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM I/O time does standard attention spend materializing S and P, and what does FlashAttention-2 save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1440", "title": "Evaluating IO-Aware Attention on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will FlashAttention-2 speed up training at sequence length 4096 despite 15% more FLOPs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1441", "title": "Diagnosing Activation Function Memory Bottlenecks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP16 activation take ~500 µs on an A100 despite using under 1 GFLOP, and how should it be optimized?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1442", "title": "Applying the Iron Law of ML Systems", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which investment, A, B, or C, gives the largest absolute end-to-end latency reduction under the Iron Law?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1443", "title": "LLM Inference Bottleneck Analysis", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why will pruning 50% of linear-layer FLOPs fail to significantly reduce batch-1 token latency for the 7B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1444", "title": "Calculating Single-Token Latency on A100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected single-token latency, and what is the primary Iron Law bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1445", "title": "Evaluating Topologies via the Iron Law", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which cluster option is more cost-effective for the 70B training run, and what MFU would Option B need to break even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1446", "title": "Analyzing True MFU vs GPU Utilization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the actual bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1447", "title": "Shift-Left Constraint Validation Savings", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "You propose a 'shift-left' validation step that tests constraints on a randomly initialized dummy model before training begins, taking just 0.5 hours. 
How many A100 GPU-hours does this save per successfully deployed model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1448", "title": "Diagnosing High Iteration Tax in RTB Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the staging failure iteration tax, and what structural pipeline change would prevent 15ms latency failures after training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1449", "title": "Evaluating Iteration Tax in LLM Deployment", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cost of discovering the KV-cache issue after 4 weeks, and what shift-left workflow prevents a full retraining cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1450", "title": "Calculating JSD for Disjoint Distributions", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does KL divergence crash for P=[1,0] and Q=[0,1], and what is the exact base-2 Jensen-Shannon Divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1451", "title": "Diagnosing Infinite Drift Alerts in KL Divergence", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this failure mode and architect a more robust drift detection thresholding mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1452", "title": "Drift Detection Thresholds in High-Throughput Recommendation Systems", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should automated retraining use KL divergence or JS divergence for hard drift thresholds on 256-dimensional embeddings, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1453", "title": "JIT Recompilation Spikes in Dynamic Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause and resolve these P99 latency spikes associated with JIT compilation?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 2}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1454", "title": "Quantifying Element-wise Kernel Fusion Speedup", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical speedup would a fused custom CUDA kernel achieve for this 1GB FP16 activation layer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1455", "title": "Calculating JIT Kernel Fusion Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After JIT warmup, what is the new inference latency and total latency reduction from eliminating dispatch and fusing kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1456", "title": "Root-Causing Memory-Bound Activations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this bottleneck, and what is the structural cause within the framework?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1457", "title": "Mitigating Dispatch Bottlenecks with CUDA Graphs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total latency for 400 small MLP kernels with 5 µs launch overhead, and how would you remove the dispatch bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1458", "title": "Evaluating JIT Compilation for Dynamic Shapes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you use JIT compilation for 10-to-512-token dynamic queries to meet the 5ms P99 SLA without 500ms recompilation spikes?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 3}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1459", "title": "Evaluating Fusion Strategies for Custom Layers", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which option would you choose for the 50MB memory-bound operation chain, and how do memory traffic, latency, and engineering cost compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1460", "title": "Diagnosing CPU Bottlenecks in Batch-1 Inference", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch-size-1 serving miss the 30ms SLA despite high batch-32 utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1461", "title": "Calculating KL Divergence for Feature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is D_KL(P||Q) for the age-group distributions, and should the severity-1 alert fire?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 0}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1462", "title": "Diagnosing Silent Accuracy Drops using KL Divergence", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What does the spike in KL(Production||Training) to 1.2 nats imply, and how would you diagnose the CTR drop?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 1}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1463", "title": "Memory Overhead of Teacher Logits in LLM Distillation", "topic": "knowledge-distillation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much GPU memory is needed to materialize the teacher's FP32 logits for one 16×2048 micro-batch over a 100,000-token vocabulary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1464", "title": "Evaluating KL Divergence for High-Throughput Drift Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is pure KL divergence robust enough for the 864M-sample daily drift trigger, and how would you handle unseen production bins?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 2}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1465", "title": "Debugging Online KD Bottlenecks", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this inefficiency, and how should the distillation system be redesigned?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 1}, "chain_tiers": 
{"cloud-chain-auto-secondary-015-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1466", "title": "Evaluating Distillation Trade-offs for Cloud LLM Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 70B-to-7B summarizer use logit distillation, intermediate-state distillation, or both, given the capacity gap and temperature trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-015-32"], "chain_positions": {"cloud-chain-auto-secondary-015-32": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1467", "title": "CUDA Graphs vs Fusion for Inference", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would CUDA Graphs or a custom fused Triton kernel better address the dynamic batch-1 attention bottleneck, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1468", "title": "Evaluating KV Cache Memory Constraints and PagedAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can static KV-cache allocation support batch 128 at 2048 tokens on 4×80GB A100s, and what serving architecture should replace it?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 4}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1469", "title": "Calculate Maximum Batch Size for LLM KV Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum concurrent batch size can the 80 GiB GPU serve before KV-cache OOM at 2048 tokens?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 2}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1470", "title": "Diagnosing LLM Serving OOMs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What exact KV-cache memory bottleneck causes OOM for 128 concurrent 4096-token Llama-2-70B requests on 320GB VRAM?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 3}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1471", "title": "KV Cache Checkpoint 
Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bandwidth is required to checkpoint 50 full 16,000-token KV caches every 5 seconds, and is it viable on 100Gbps NICs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1472", "title": "Diagnosing Network Bottlenecks in KV Cache Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do full-KV checkpoints cause 30-second failover delays and 100% network egress at 64K-token contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1473", "title": "Stateful Serving Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which checkpointing strategy and frequency best meet the 2.0s RTO for 64K-token Llama-3-70B sessions over 400Gbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1474", "title": "Calculating INT8 KV Cache Memory Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FP16 KV cache memory for 64 requests at 4096 tokens, and will INT8 KV cache quantization fit the 7B model on one 80GB GPU?", "chain_ids": ["cloud-chain-auto-008-15"], "chain_positions": {"cloud-chain-auto-008-15": 0}, "chain_tiers": {"cloud-chain-auto-008-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1475", "title": "Evaluating KV Cache Quantization for Long-Context RAG", "topic": "compound-ai-systems", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much KV-cache memory does FP16 vs INT8 require at batch 64 and 32K context, and what strategy preserves long-context retrieval accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1476", "title": "Quantifying Label Quality Drift in Moderation Pipelines", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the additional monthly cost caused by label drift, and what should you implement to detect and prevent noisy labels from poisoning retraining?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1477", "title": "Diagnosing Output Degradation in INT8 KV Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure?", "chain_ids": ["cloud-chain-auto-008-15"], "chain_positions": {"cloud-chain-auto-008-15": 1}, "chain_tiers": {"cloud-chain-auto-008-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1478", "title": "Mitigating Annotation Degradation in Moderation Pipelines", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which labeling strategy should you choose under the $10,000/day budget to address label quality drift: (1) k=3 consensus, (2) 5% in-house sample, or (3) Confident Learning?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1479", "title": "Diagnosing Label Shift in E-Commerce Content Moderation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What label-shift correction should you apply when the spam prior jumps from 5% to 40% while P(Y|X) stays accurate?", "chain_ids": ["cloud-chain-auto-003-09"], "chain_positions": {"cloud-chain-auto-003-09": 1}, "chain_tiers": {"cloud-chain-auto-003-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1480", "title": "Real-Time Fraud Label Shift Adaptation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the estimated true fraud rate when 27.5% of transactions are flagged, and how should you adjust the model outputs?", "chain_ids": ["cloud-chain-auto-003-09"], "chain_positions": {"cloud-chain-auto-003-09": 0}, "chain_tiers": {"cloud-chain-auto-003-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1481", "title": "Diagnosing Annotation Degradation in Data Pipelines", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically root-cause the F1 drop, and which metrics would prove label quality drift?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1483", "title": "Diagnosing P99 Latency Spikes in Triton", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 200ms+ P99 spikes despite 45% GPU utilization, and how would you fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1484", "title": "Calculating Maximum Allowable Latency Jitter for RTB", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum allowable jitter under the 40ms SLA, and how much must the current 45ms jitter be reduced?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1485", "title": "Mitigating Tail Latency Jitter in Real-time LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is more cost-effective for cutting OS-induced P99 jitter below 250ms: CPU pinning and NUMA isolation or DPDK/RDMA, and how would you justify this quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1486", "title": "Layer Normalization Memory Cost", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the naive LayerNorm memory traffic and theoretical time, and how do they compare with a fused implementation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1487", "title": "Evaluating Compute vs. 
Network Hardware Upgrades", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you upgrade the 512-GPU cluster to H100s while keeping 200 Gbps InfiniBand, and what does distributed efficiency predict?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1488", "title": "Diagnosing Distributed Training Scaling Failure", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the 5x compute upgrade to 8x A100 nodes increase 30B training throughput by only 38%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1489", "title": "Applying the Law of Distributed Efficiency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Ring All-Reduce communication time, and how does halving compute time affect the communication-to-computation ratio and speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1490", "title": "Diagnosing LLM Data Exfiltration Across Layers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TLS, IAM, and a WAF fail to stop patient-record extraction, and what ML-specific defenses are missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1491", "title": "Evaluating Normalization at Micro-Batch Scale", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does BatchNorm break down with micro-batch size 2 across 128 GPUs, and what normalization should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1492", "title": "Latency Overhead in Layered Defense", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the new end-to-end latency with TEE, input sanitization, and DP noise, and does it meet the 125ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1493", "title": "LLM Layered Defense Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 8B guardrail run on the H100s or an L4 node, considering TTFT and H100 KV-cache utilization?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1494", "title": "Root-Causing Throughput Collapse via Exposed Trace APIs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing throughput to collapse from 4,000 req/sec to 150 req/sec, and how should the Triton ingress be secured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1495", "title": "Exploiting Debug Headers in Shared LLM Caches", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What information leak do the debug headers and cross-tenant prefix cache create, and how should you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1496", "title": "Triton Debug Port Exposure Risks", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the security risks of exposing Triton metrics and profiler ports to 10,000 engineers, and how should observability be secured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1497", "title": "LayerNorm Memory Bandwidth Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this bottleneck and what implementation resolves it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1498", "title": "Mitigating the Learnability Gap in Massive MLPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the team double the MLP to 1B parameters, or use structural inductive biases to close the learnability gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1499", "title": "Evaluating the Learnability Gap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why would a 50B flat MLP fail on satellite images despite higher capacity, and why choose the 5B ViT instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1501", "title": "Trans-Pacific ML Fraud Detection Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": 
"Can a centralized Virginia fraud service meet a 50ms SLA for users in Sydney when model inference takes 10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1502", "title": "Hardware Refresh Carbon ROI", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the hydro-powered A100 cluster be replaced with 128 H100s now, considering 2-year lifecycle carbon accounting?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 2}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1503", "title": "Hardware Upgrade Carbon Breakeven", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which option minimizes total carbon over 3 years: 1,000 new H100s or 2,500 repurposed V100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1504", "title": "Datacenter Carbon Lifecycle Assessment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total lifecycle carbon per node in Region B, and what percentage comes from embodied carbon?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1505", "title": "Diagnosing Global Inference Latency Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can TensorRT optimization make Tokyo-to-us-east-1 inference meet the 50ms SLA, or what architectural change is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1506", "title": "LLM Fleet Capacity via Little's Law", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum total KV cache memory required to sustain 2,000 RPS at 1.5s latency with 50MB per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1507", "title": "Diagnosing Throughput Collapse in LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using Little's Law, what concurrency is the 8xH100 LLM service hitting at 30 RPS and 5s latency, and what bottleneck explains the plateau?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1508", 
"title": "Global Real-Time Inference and the Light Barrier", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a centralized us-east-1 fraud service meet a 100ms P99 SLA for Tokyo and Sydney traffic with 20ms H100 inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1509", "title": "L4 vs L7 Load Balancing for gRPC Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does P99 latency spike above 100ms at 75% utilization under L4 round-robin, and what load balancing strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1510", "title": "Evaluating Cluster Concurrency Limits", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a 200-request global concurrency cap meet 500 RPS at 2.5s average latency, and what concurrency should be provisioned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1511", "title": "LLM Fleet Tail Latency Debugging", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 latency hit 9s at 40% average GPU utilization, and what L7 routing policy should replace round-robin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1512", "title": "L7 Load Balancing for Speech AI", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimum ingress bandwidth is needed for 2,000 audio streams at 250 KB/s, and what routing algorithm minimizes P99 latency?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1513", "title": "Diagnosing P99 Spikes in LLM Fleets", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 TTFT exceed 3s at only 60% GPU utilization under round-robin, and what routing signal should the load balancer use?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1514", 
"title": "Disaggregated Load Balancing for LLMs", "topic": "load-balancing", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you migrate to disaggregated prefill/decode serving or overprovision the cluster, and why?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1515", "title": "Training Time Estimation with Local NVMe Caching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long does 10-epoch training take with a local NVMe warm cache versus reading the 64TB dataset from S3 every epoch?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 0}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1516", "title": "Debugging Remote Storage Bottlenecks in Multi-Node Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is bottlenecking the job, and how can local NVMe caching raise GPU utilization above 80% after epoch 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1517", "title": "Local NVMe Caching for Multi-Epoch Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use node-local NVMe warm caching, a Redis-style distributed cache, or Lustre for the 15TB S3 vision dataset, and why?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 2}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1518", "title": "Diagnosing Low NVMe Cache Hits in Batch Inference", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are NVMe cache hit rates below 5% in the daily Kubernetes jobs, and how should scheduling be changed?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 1}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1519", "title": "Strict vs Relaxed Locality-Aware Scheduling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": 
"Should the cluster use strict cache-local scheduling or relaxed scheduling with remote cache reads, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1520", "title": "Delay Scheduling Break-even Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum wait time T (in seconds) the scheduler should tolerate before falling back to scheduling the job on an idle node and pulling the dataset over the network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1521", "title": "Sizing an M/G/c/K LLM Inference Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the server utilization p and average end-to-end latency for the service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1522", "title": "M/G/c/K Queue Sizing for LLM Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the M/G/c/K 70B API gateway, should K be set to 16 or 64 to balance drops against P99 wait time, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1523", "title": "Analyzing Catastrophic Forgetting in LLM Unlearning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did gradient-ascent unlearning collapse MMLU from 72% to 47%, and what loss term is missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1524", "title": "Analyzing P99 Latency in M/G/c/K Queues", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do p99 latency and drops violate the 2-second SLA despite 75% utilization in the LLM inference queue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1525", "title": "Evaluating LLM Unlearning Strategies", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which unlearning approach should you use for the 100M-token GDPR deletion on the 7B LLM, and how would you validate compliance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1526", "title": "SISA Exact Unlearning Cost", "topic": "data-pipeline-engineering", "competency_area": 
"data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the minimum GPU-hour cost to exactly unlearn records A, B, and C from the SISA ensemble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1527", "title": "Evaluating GPU Instances for High-Batch Dense FFNs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which instance do you select to maximize throughput, and how do you justify this via the roofline model?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 4}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1529", "title": "Diagnosing Attention vs FFN Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the MHA batched GEMMs show lower SM utilization than FFN layers at sequence length 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1530", "title": "Analyzing FFN Layer Bottlenecks on A100 GPUs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FFN's arithmetic intensity during batch-1 autoregressive generation, and what execution time should the A100 achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1531", "title": "Evaluating GEMM Tiling Strategies on A100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the proposed 256x256 FP16 GEMM tile viable, and what tile size should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1532", "title": "Diagnosing MMD Monitoring Bottlenecks at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does exact RBF-kernel MMD OOM and time out for the 1M-reference, 18M-live drift job, and what scalable method should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1533", "title": "MMD Drift Detection Compute Constraint", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much compute does naive quadratic MMD need for N=M=20,000, does it fit 100 GFLOPs, 
and what N=M stays within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1534", "title": "Evaluating MMD Drift Detection Scale", "topic": "compound-ai-systems", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can exact nightly MMD between 100,000 reference embeddings and 10^8 daily queries finish on one A100 in 1 hour, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1535", "title": "Diagnosing GEMM Tensor Core Underutilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the M=1023 GEMM achieve 45 TFLOPs while the M=1024 GEMM exceeds 200 TFLOPs in FP16 PyTorch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1536", "title": "Mitigating MIA in Clinical APIs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which MIA mitigation should you choose for the 15B clinical model—DP-SGD, Top-1 truncation, or rate limiting—and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1537", "title": "Evaluating Membership Inference Attack Advantage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the MIA TPR, FPR, and Attack Advantage, and what do they imply for leakage and mitigation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1538", "title": "Attention Matrix Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FP16 memory footprint of the B=16, H=16, S=4096 attention score matrix, and will it fit in the 40MB L2 cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1539", "title": "Diagnosing Sequence Length OOM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does increasing sequence length to 8192 cause forward-pass OOM under ZeRO-1, and what attention implementation fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1540", "title": "Activation Checkpointing vs CPU Offloading", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "Should you bridge the 40GB gap using Activation Checkpointing or CPU Offloading over PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1541", "title": "Evaluating L40S vs H100 for LLM Decoding", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For batch-1 decoding of a 70B FP16 chatbot, should you choose 8x L40S or 4x H100 SXM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1542", "title": "Diagnosing Low Compute Utilization in Autoregressive Generation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this bottleneck and what should be the actual optimization strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1543", "title": "LLM Inference Memory Bandwidth Calculation: Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical batch-1 autoregressive generation rate for the 13B FP16 model?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 1}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1544", "title": "Calculate Autoregressive Token Generation Throughput", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the maximum theoretical token rates for the 7B FP16 model on the A10G at batch size 1 and batch size 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1545", "title": "Evaluating GPU Upgrades for LLM Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will H100s give the expected 3.1x speedup for batch-1 Llama-2 70B decoding, and what speedup should leadership expect?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 3}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1546", "title": "LLM Serving Bottleneck Evaluation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 70B decoding on 8x A100s with a 50ms TBT 
target, should you prioritize INT8 matmul or continuous batching with PagedAttention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1547", "title": "Roofline Analysis of LLM Decoding on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What arithmetic intensity and throughput should you expect from upgrading to 600 TFLOPS but the same 2.0 TB/s bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1549", "title": "Diagnosing Low Utilization in 100B DLRM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing low SM utilization and high P99 latency in the DLRM embedding workload, and what limits it?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 1}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1550", "title": "Sizing GPU Clusters for DLRM Embeddings", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many A100 80GB GPUs are minimally required to serve the 10B-entry DLRM embeddings plus the 50GB overhead?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 0}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1551", "title": "Evaluate Memory Locking vs Swap Disabling for P99 Latency", "topic": "compound-ai-systems", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you disable swap or use mlockall for the 20GB embedding service on the 32GB VM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1552", "title": "Mitigating Latency Jitter with Memory Locking", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What latency penalty does 10MB of swapped-out scattered weights add to inference, and what application-level fix prevents it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1553", "title": "DLRM Latency Spikes Post-Traffic Lull", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 800ms P99 burst-latency spikes on the 35GB DLRM VM, and what system-level fix should you apply?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1554", "title": "DLRM Tiered Memory Architecture", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you shard the 800B-parameter DLRM purely in GPU HBM or use HBM plus host DDR5 for embeddings, and why?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 2}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1555", "title": "Calculating Baseline Training Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the absolute minimum baseline memory footprint required per accelerator just to store the weights, gradients, and optimizer states?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1556", "title": "Static Memory Planning Buffer Reuse", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What peak activation memory does static liveness-based planning require, and how much memory is saved compared to naive allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1557", "title": "Diagnosing p99 Latency Spikes from Dynamic Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can graph compilation and memory planning eliminate this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1558", "title": "Evaluating Static Memory Planning vs Dynamic Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you choose between dynamic allocation, static max-shape planning, and bucketed static planning for serving this variable-length 13B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1559", "title": "Diagnosing OOM in 7B LLM Full Fine-Tuning", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does full fine-tuning the 7B FP16 model OOM on an 80GB GPU even with batch size 1, and what should you do?", "chain_ids": 
["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1560", "title": "LLM Decoding on A100 Roofline", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What arithmetic intensity does the A100 need, how does batch-1 decoding compare, and what serving change fixes the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1561", "title": "Small File Metadata Overhead in Object Storage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What throughput limit does 20ms per-file S3 latency impose, and how should you restructure the 10M-image dataset to fix it?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 0}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1562", "title": "Diagnosing Sublinear Inference Scaling on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did H100 batch-1 LLM decoding improve only ~1.6x instead of >3x, and how can you escape that bottleneck?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 2}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1563", "title": "Diagnosing Small File Metadata Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the A100s starved despite 20GB/s storage bandwidth, and how should the 10M JPEGs be stored instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1564", "title": "Prometheus Cardinality Explosion", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you redesign the observability stack to support tenant-level billing without crashing the metrics server?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 0}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1565", "title": "Evaluating Storage Formats for Small-File Datasets", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", 
"level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should 100M 50KB image chips be stored as individual JPEGs for 10k images/s training, and what architecture should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1566", "title": "Diagnosing Observability OOM Cascades", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the pods running out of memory, and how do you fix it?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 1}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1567", "title": "Managing High Metric Cardinality in Global Model Serving", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate the metric design and what tradeoffs must you make to stabilize the TSDB without losing visibility?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 2}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1569", "title": "Evaluating Unified Network Fabrics for Mice and Elephant Flows", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 400Gbps Clos fabric use flow-level DLB or cell-based packet spraying to protect LLM training from web-service mice flows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1570", "title": "Optimizing Arithmetic Intensity with Mini-Batching", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum mini-batch size makes the 4096x4096 FP16 layer compute-bound on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1571", "title": "Diagnosing p99 Latency Spikes in Mixed-Flow Networks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does p99 inference latency spike during 5GB ingestion bursts even though ToR utilization averages only 40%, and how should you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1572", "title": "Estimating Mixed-Precision Memory Footprint", "topic": "mixed-precision-training", "competency_area": "memory", "track": "cloud", 
"level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the actual static memory footprint for 10B parameters under standard AMP with Adam, and can it fit on one GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1573", "title": "Diagnosing Mixed-Precision OOM Failures", "topic": "mixed-precision-training", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FP16 mixed-precision AdamW for a 7B model OOM before the first forward pass under DDP, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-01"], "chain_positions": {"cloud-chain-auto-secondary-015-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1574", "title": "Diagnosing A100 Underutilization in NLP Training", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the root cause of this underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1575", "title": "Evaluating FP16 vs BF16 for LLM Pre-training Stability", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 30B LLM training run switch from FP16 with loss scaling to BF16, and what trade-offs justify the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1576", "title": "Evaluating Mini-Batch Size on A100", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is batch size 16 a good systems choice for the MLP, and what batch size would better match the hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1577", "title": "MoE Active Parameter Bandwidth", "topic": "mixture-of-experts", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming FP16 precision and a batch size of 1, how much GPU memory is needed for weights and how much memory bandwidth is consumed to decode a single token?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 1}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1578", "title": "Diagnosing MoE Routing and OOM Bottlenecks", "topic": "mixture-of-experts", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "both", "question": "Why do specific GPUs OOM and tail latency spike under expert parallelism, and how should you fix the 8x7B MoE serving path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1579", "title": "Evaluating MoE Serving Topologies on H100 Nodes", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which parallelization topology should you use for the 8x22B MoE on 8x GPUs, and why is it better than pure TP or pipeline parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1580", "title": "Architecting a Multi-Tier Storage Hierarchy for H100 Clusters", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 5PB image pipeline use a centralized all-flash file system or local NVMe caching to reach 250GB/s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1581", "title": "Diagnosing NVMe Cache Thrashing in CV Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did adding 4TB NVMe caches per node barely improve utilization, and what data access pattern should replace global random shuffling?", "chain_ids": ["cloud-chain-auto-secondary-007-01"], "chain_positions": {"cloud-chain-auto-secondary-007-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1582", "title": "Multi-Tier Pre-fetch Sizing for 3D ViT", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the node's data ingestion rate, can it stream directly from 5GB/s S3, and how long to prefetch a 2TB epoch to NVMe?", "chain_ids": ["cloud-chain-auto-secondary-007-01"], "chain_positions": {"cloud-chain-auto-secondary-007-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1583", "title": "Production-Weighted Model Quality Benchmarking", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the INT4 model's production-weighted accuracy, and what RAG accuracy is needed to keep degradation under 2% relative to the baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1584", "title": "MLPerf Time-to-Train Cost Estimation", "topic": "roofline-analysis", "competency_area": "compute", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming linear scaling, what are the estimated training time and cost for each vendor, and which instance is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1585", "title": "Evaluating FP8 Trade-offs in MLPerf Time-to-Train", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt FP8 for the MLPerf LLM submission given 1.2s steps but 70,000 convergence steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1586", "title": "Diagnosing Activation Outliers in INT8 Quantized LLMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did W8A8 quantization look fine on MMLU but fail on long-context support queries, and what is the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1588", "title": "Large Batch MLPerf Convergence Analysis", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did MLPerf Time to Train regress after doubling the batch size despite higher hardware throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1589", "title": "Root-Causing Deployment Failures via Disaggregated Evaluation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause this systemic failure, and what specific statistical artifacts were missing from the Model Card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1590", "title": "Benchmarking Quantization Degradation in Generative LLMs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did offline quantization benchmarks miss the code-completion regression, and how should you benchmark INT8 model quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1591", "title": "40B Model Cold-Start Latency Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum theoretical cold-start latency to load the 40B FP16 model into HBM over the 100Gbps network and PCIe Gen5?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1592", "title": "Diagnosing LLM Cold-Start Loading Bottlenecks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What structural bottlenecks cause the 4.5-minute cold start for the 140GB LLM, and which loading steps dominate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1593", "title": "Trade-offs in Disaggregated Performance vs Latency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you serve the medical triage LLMs to meet the <200ms TTFT SLA while satisfying cost and subgroup fairness constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1594", "title": "Evaluating Storage Backends for LLM Cold-Starts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which storage backend would you choose to load 350GB of FP16 weights within a strict 45-second cold-start SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1595", "title": "Dynamic Ensemble Compute vs Memory", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-request TFLOPS and active VRAM requirements for soft routing (all 4 outputs) versus top-1 hard routing across the 4 experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1596", "title": "Diagnosing Dynamic Ensemble Latency on A10G", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 685ms P99 latency with only 20% GPU utilization, and how would you fix it on 24GB instances?", "chain_ids": ["cloud-chain-auto-secondary-005-08"], "chain_positions": {"cloud-chain-auto-secondary-005-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1597", "title": "Calculating MFU for LLM Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Model FLOPS Utilization for this 20B LLM training run on 256 GPUs with a 12.0-second step time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1598", "title": "Evaluating Dynamic 
Ensembles for Fraud Detection", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which serving architecture should you choose for the 3 specialized 13B fraud models to meet <100ms P99 at 10,000 QPS, and why?", "chain_ids": ["cloud-chain-auto-secondary-005-08"], "chain_positions": {"cloud-chain-auto-secondary-005-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1600", "title": "Diagnosing TensorRT Dynamic Shape Spikes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the TensorRT engine spike to 200ms and OOM on sequence lengths 32–512, and how should the dynamic shapes be configured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1602", "title": "Black-Box Model Inversion Query Cost", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many queries, how much time, and how much cost would 100 steps require, and what API output mitigation would disrupt the attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1603", "title": "Evaluating TensorRT vs ONNX for Dynamic Shapes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you optimize this dynamic-shape BERT-Large workload with ONNX Runtime or TensorRT, and how would you minimize P99 latency jitter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1604", "title": "Evaluating MFU for LLM Training on A100 Cluster", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate the current MFU to justify this decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1605", "title": "Cloud API Model Inversion Vulnerability", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What API design flaw enables the 50,000-query face reconstruction attack, and how would you mitigate it without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1606", "title": "Diagnosing PII Leakage in Fine-Tuned LLMs", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did exact-string deduplication fail to prevent PII memorization, and what pipeline changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1607", "title": "Mitigating PII Memorization in Clinical LLMs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate three distinct mitigations (aggressive corpus deduplication/scrubbing, Differential Privacy training (DP-SGD), and post-training RLHF refusal guardrails) and justify your primary architectural choice quantitatively based on compute budget, utility degradation, and verifiable privacy guarantees?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1608", "title": "Diagnosing Inter-Node TP Bottlenecks", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you place tensor and pipeline parallelism across the two nodes to eliminate the cross-node all-reduce bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1609", "title": "Calculating Minimum Model Parallelism Degree", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum tensor-parallel degree fits the 30B mixed-precision Adam training state on 40GB GPUs if ZeRO/FSDP and PP are disabled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1610", "title": "Mitigating Black-Box Model Inversion on Health APIs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you choose to stop the 50,000-query inversion attack while keeping latency under 50ms and accuracy loss under 1.5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1611", "title": "Evaluating TP and PP Placement for 175B LLMs", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you place Tensor Parallelism and Pipeline Parallelism across the nodes to maximize MFU for the 175B FP16 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1612", "title": "LLM Quantization Trade-offs on A100", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "Would you deploy the 70B LLM as FP16 across 2 GPUs or INT4 on 1 GPU, and why for decoding throughput and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1613", "title": "GPU Sizing for 70B Model Sharding", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many A10G GPUs are required to tensor-shard the 70B FP16 LLM with 20 GiB total KV cache and activation memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1614", "title": "LLM Inference Throughput with INT8 Quantization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum token generation rates for the 30B model on one GPU using FP16 versus INT8 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1615", "title": "Diagnosing Asymmetric OOM in Tensor Parallel Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 8-way tensor-parallel 70B model OOM only on GPU 0 during batch-128, sequence-2048 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1616", "title": "LLM Inference Sharding Strategy", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use TP=8, PP=8, or a hybrid to shard the 175B FP16 model on one 8x A100 node for <200ms TTFT, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1617", "title": "Quantization Speedup Failure at Large Batch", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W8A8 quantization improve token throughput by less than 5% at batch size 512 despite double the theoretical INT8 peak TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1618", "title": "Verifying LLM Output Watermarks via Z-Score", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What Z-score does 260 green tokens out of 400 produce with γ=0.5, and does it exceed the legal threshold of Z=4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1619", "title": "Evaluating LLM Watermark Sequence Requirements", "topic": "federated-learning", "competency_area": 
"cross-cutting", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the watermark undetectable on 20-token responses, and what minimum sequence length is needed to reach $Z \\ge 4$?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1620", "title": "MIG Profile Packing and Resource Fragmentation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many `1g.5gb` MIG profiles can Tenant C get after the `4g.20gb` and `2g.10gb` allocations, and what resource limits further allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1621", "title": "Batched PRNG Collisions in Logit Watermarking", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the watermark z-score collapse from >6 at batch size 1 to ~0.1 under continuous batching with B=64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1622", "title": "Diagnosing Throughput Drop Migrating from MPS to MIG", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did moving from MPS to MIG cause this specific performance degradation, and what is the root cause bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1623", "title": "Global Gateway Rollback Impact Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many requests experience degraded latency during the 45-second routing propagation and 60-second linear drain rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1624", "title": "MIG Compute Partitioning vs. 
Latency SLAs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is partitioning the A100 into 7x 1g.10gb MIG profiles feasible for seven 8GB services with a 50ms P99 SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1625", "title": "Artifact Sync Race Condition in Multi-Region Canary", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused eu-central ModelNotFound errors during the 5% canary rollout, and how would you redesign the deployment protocol?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1626", "title": "Global Large Model Rollout and Rollback Evaluation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you distribute and roll back 60GB model weights across 3 regions to guarantee a sub-60-second global rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1627", "title": "Diagnosing Intersectional Calibration Failures", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically diagnose and correct localized miscalibrations without manually auditing all permutations or overfitting on small slices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1628", "title": "A/B Testing FWER Correction", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many false discoveries are expected across 20 models and 4 metrics at alpha=0.05, and what Bonferroni alpha controls FWER at 0.05?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1629", "title": "Diagnosing A/B Test False Positives", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 18 seemingly significant model promotions degrade revenue despite positive A/B dashboards across 50 variants and 10 metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1630", "title": "Evaluating Multicalibration in Risk Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you 
use iterative multicalibration post-processing or a subgroup MoE for the clinical model under a 50ms P99 budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1633", "title": "Theoretical Minimum MatMul Latency", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum execution time for the FFN up-projection Y=XW on this GPU at 312 TFLOPs FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1634", "title": "Diagnosing Tail Latency in RESTful Image Serving", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 200ms p99 latency and low GPU utilization, and how would you fix the request path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1635", "title": "Dynamic Batching Network Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the max theoretical throughput and end-to-end latency for a request that waits the full 10ms batch timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1636", "title": "MAC Utilization in Memory-Bound Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator, A or B, gives lower batch-1 generation latency per token, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1637", "title": "NPU Thermal Constraints and Edge Compute Limits", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum sustainable frame rate can the CPU and NPU each achieve within the 4W ML thermal envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1638", "title": "Analyzing Edge NPU Graph Compiler Fallbacks", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the 120ms edge latency spike after replacing ReLU with Swish, despite dropping NPU utilization?", "chain_ids": ["cloud-chain-auto-secondary-015-10"], "chain_positions": {"cloud-chain-auto-secondary-015-10": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1639", "title": "Diagnosing Low MAC Throughput in Custom Kernels", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP16 custom CUDA kernel top out at about 8 TFLOPS instead of using the T4's 65 TFLOPS Tensor Cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1640", "title": "Evaluating Networked Serving vs Embedded Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you extract the image embedding model into a Triton microservice, how many T4 GPUs are needed at 400 QPS, and what max batch delay meets 50ms p99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1641", "title": "Cloud-to-Edge NPU Offloading Architecture", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you move the 1,000-camera video analytics workload to 4 TOPS edge NPUs, and what compute/TCO trade-offs drive the decision?", "chain_ids": ["cloud-chain-auto-secondary-015-10"], "chain_positions": {"cloud-chain-auto-secondary-015-10": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-10": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1642", "title": "Dual-Socket NUMA Dataloader Bandwidth", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much cross-socket bandwidth do GPUs 4-7 need if staging memory is accidentally allocated on Socket 0, and how would you fix the NUMA issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1643", "title": "Diagnosing GPU Starvation via Cross-Socket Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is starving the A100 GPUs despite 40 GB/s storage bandwidth, and how would you fix the NUMA placement?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1644", "title": "Evaluating NUMA-Aware Data Loading Bottlenecks", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes GPUs 4-7 to run at 30% lower utilization, and how should you redesign data-loader CPU and memory affinity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1645", "title": "Diagnosing High Tail Latency in Dual-Socket CPU Inference", "topic": 
"memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause and resolve this scaling bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-017-20"], "chain_positions": {"cloud-chain-auto-secondary-017-20": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1646", "title": "Optimizing Dual-Socket Memory Bandwidth", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What effective memory bandwidth does the shared Node-0 model get, and how should you reconfigure the workers for NUMA locality?", "chain_ids": ["cloud-chain-auto-secondary-017-20"], "chain_positions": {"cloud-chain-auto-secondary-017-20": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1647", "title": "Evaluating TP and PP Placement Across NVLink", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where should tensor parallelism and pipeline parallelism be placed across H100 nodes to minimize communication bottlenecks, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1648", "title": "NVLink 4.0 Activation Transfer Time", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum time to transfer the 45GB activation from GPU 0 to GPU 1 over NVLink 4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1649", "title": "Direct S3 Streaming Bandwidth for A100 Clusters", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What network bandwidth and how many concurrent HTTP byte-range requests are required per 8-GPU node to stream the 1MB TFRecords?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1650", "title": "NVLink to PCIe Fallback", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely causes the ~14x slow intra-node ncclAllGather, and how would you verify and fix the topology issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1651", "title": "Object Storage Streaming vs POSIX Systems", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 5PB training dataset be copied to Lustre or streamed directly from object storage, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1652", "title": "Red Team LLM: Adversarial Payload Architecture", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy can meet 5,000 evasive payloads/sec on the cluster, white-box surrogate or black-box RL, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1653", "title": "Diagnosing Object Storage Prefix Rate Limits in Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does S3 streaming flatline at exactly 11 GB/s despite 50 GB/s instance networking, and how should the dataset be laid out?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1655", "title": "LLM-Automated Spear Phishing Scale", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 200-token personalized phishing emails can the 50-node T4 botnet generate in 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1656", "title": "Online Learning Backlog Catch-Up Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will it take to process the accumulated backlog and catch up to the live stream once restored?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1657", "title": "Diagnosing OOM in DLRM Online Learning", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing VRAM to grow from 10GB to 24GB over 48 hours despite constant event volume and cleared computation graphs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1658", "title": "Continuous Adaptation for DLRM under Drift", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should online learning update the full 101GB DLRM or only the 1GB dense MLP while embeddings update daily, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1659", "title": "Debugging Block-wise Softmax for Long-Context Attention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and fix the tile-based softmax computation to match exact attention?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 1}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1661", "title": "Diagnosing ONNX Runtime Graph Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and resolve this performance bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1662", "title": "Evaluating Online Softmax for Long-Context Kernels", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is block-based online softmax strictly required for 65,536-token attention to avoid the memory wall compared to HBM-materialized or naive 1-pass softmax?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 2}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1663", "title": "Online Softmax Memory Footprint Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM is needed for the 65,536 x 65,536 FP16 attention matrix, and how much SRAM for one 128-query block's online-softmax stats?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 0}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1665", "title": "Diagnosing Carbon Footprint Spikes in Hardware Upgrades", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did operational carbon increase after moving the 10-day A100 job to the faster H100 Midwest cluster, and by how much?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1666", "title": "Estimating LLM Training Operational Carbon", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total operational carbon emissions will the 128-node training run produce over 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1667", "title": "Output Stationary SRAM Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact reduction in psum SRAM accesses achieved by an Output Stationary dataflow compared to reading and writing the psum for every MAC operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1668", "title": "LLM Training Region Carbon Footprint Evaluation", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which region must you choose to stay under the 50-ton CO2 budget, and what is the financial trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1669", "title": "Diagnosing Write-Bound Systolic Arrays", "topic": "extreme-quantization", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the architectural root cause of this inverted memory profile, and how should you reconfigure the dataflow to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1670", "title": "Calculating Paged KV Cache Capacity", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many concurrent requests can PagedAttention support with 512-token average sequences versus static 2048-token KV allocation?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 0}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1671", "title": "Dataflow Trade-offs for Cloud Accelerators", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the accelerator use Output Stationary or Weight Stationary dataflow for layers with many partial sums, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1672", "title": "Diagnosing Low Batch Size in Paged KV Cache", "topic": "kv-cache-management", 
"competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing OOMs in the Paged KV cache with 1024-token blocks on a mixed 150-token and 2000-token workload?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 1}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1673", "title": "Evaluating Paged KV Cache Block Sizes", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Paged KV cache block size, 16 or 256 tokens, should you choose for Llama-2-70B, and why?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 2}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1674", "title": "Paged KV Cache Capacity Calculation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected increase in concurrent request capacity with 16-token paged KV cache allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1675", "title": "Evaluating Block Sizes in Paged Memory Management", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PagedAttention block size (1, 16, or 256 tokens) would you choose, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1676", "title": "Evaluating PagedAttention for LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What impact will migrating to 16-token PagedAttention blocks have on memory utilization and maximum batch size?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1677", "title": "Calculating PAM4 Baud Rate for 800G Links", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What actual symbol rate per lane is required to support the 800G link over 8 PAM4 lanes with 6.25% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1678", "title": "Diagnosing PagedAttention Block Size Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the original OOMs, and 
how should you choose a PagedAttention block size between 256 and 8 tokens?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 1}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1679", "title": "Diagnosing PAM4 FEC Errors in 400G Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the root cause of this performance degradation in the PAM4 signaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1680", "title": "KV Cache Fragmentation Bottleneck", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the OOMs above 4 requests, and what concurrency should 16-token paged KV allocation support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1681", "title": "Calculating PagedAttention Memory Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory is wasted by contiguous KV allocation, and how much is consumed with 16-token PagedAttention blocks?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 0}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1682", "title": "Identifying Dominated Models on the Pareto Frontier", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the fundamental Pareto-frontier flaw in deploying Model Z over Model X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1683", "title": "Evaluating PAM4 Transceivers for 800G AI Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt 100G PAM4 over 8 lanes for 800G ports, and what trade-offs must you budget for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1684", "title": "Evaluating Pareto Optimal Models for Fraud Detection", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which models are Pareto-optimal, which are dominated, and how should the 100ms SLA affect the final choice?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1685", "title": "PCIe Gen5 Transfer Latency", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum PCIe transfer time per step for 8GB, and why will the effective time be higher?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1686", "title": "Evaluating PCIe Bottlenecks in ZeRO-Offload", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What PCIe lower bound does this ZeRO-3 offload design impose per optimizer step, and is it viable for a 5-second iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1687", "title": "Diagnosing PCIe Host-to-Device Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a 100ms step time physically impossible when fetching 8GB over a single PCIe Gen5 x16 link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1688", "title": "Pipeline Bubble Fraction Calculation", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum global batch size gives 10% or lower GPipe bubble overhead with p=8 and microbatch size 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1689", "title": "Diagnosing PCIe DataLoader Bottlenecks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the CPU bottlenecked, and how do you achieve true asynchronous transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1690", "title": "Evaluating Massive Pinned Memory Allocations", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pin the entire 400GB embedding table in host memory, and what architecture would you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1691", "title": "Diagnosing Low Utilization in Pipeline Parallelism", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 55% SM utilization, and what exact fraction of compute time is wasted by pipeline bubbles?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1692", "title": "Minimizing Pipeline Bubble Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum valid micro-batch count keeps 1F1B idle time below 10%, and what maximum micro-batch size follows?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 0}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1693", "title": "Calculate DMA Transfer Speedup with Pinned Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What transfer time and bandwidth improvement should you expect after enabling pin_memory=True for 800MB batches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1694", "title": "Accelerator Pipeline Throughput Calculation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the steady-state throughput in tiles per 100,000 clock cycles, and what is the utilization of the memory load unit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1695", "title": "GPipe vs 1F1B Schedule Trade-offs in LLM Training", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use flush-based GPipe or 1F1B for 512 microbatches, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1696", "title": "Diagnosing High Pipeline Bubble Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you change the microbatching to reduce PP bubble overhead and exceed 85% utilization without changing global batch size?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 1}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1697", "title": "Optimizing Pipeline Bubble in 175B Model Training", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you increase to 256 microbatches or use interleaved 1F1B with v=4 to cut the PP bubble below 10%, and why?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1698", "title": "Diagnosing CPU-GPU Inference Pipeline Bottlenecks", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can you restructure the CPU preprocessing, H2D transfer, and GPU compute pipeline to double throughput on the T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1699", "title": "Evaluate Pipelining vs Batch Scaling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which design maximizes throughput: CUDA-stream pipelining at batch 64 or sequential batch 128, and what throughput does it achieve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1700", "title": "Evaluating Drift Metrics for High-Volume Credit Scoring", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use K-S tests or PSI to monitor drift for 10M daily categorical and binned features, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1701", "title": "Calculating Feature Drift with Population Stability Index", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the PSI for the income-bracket shift from 80/20 to 50/50, and does it exceed the 0.2 drift threshold?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1702", "title": "Compute Overhead of KernelSHAP Explanations", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many dedicated CPU cores are required to generate KernelSHAP explanations for 10% of 500 requests/sec without queuing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1703", "title": "Diagnosing KernelSHAP Latency Bottlenecks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottleneck is causing the KernelSHAP timeouts, and how should you reduce latency below 1 second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1705", "title": "4-Bit PTQ Memory Footprint for LLMs", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 4-bit AWQ weight footprint including 
metadata, and will the 30B model fit on a 24GB VRAM GPU?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1706", "title": "Evaluating SHAP vs LIME for Real-Time Fraud", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which explainer—KernelSHAP, TreeSHAP, or LIME—should you use to meet the 50ms inference-plus-explainability budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1707", "title": "Debugging INT8 PTQ Degradation in 70B LLMs", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the W8A8 PTQ perplexity collapse, and what targeted quantization fix would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1708", "title": "Evaluating PTQ Strategies for 70B LLM Serving", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use W8A8 or W4A16 PTQ to fit the 70B LLM on one 80GB A100 for high-throughput serving, and why?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1709", "title": "Cloud RAPL Power Side-Channel Analysis", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you mitigate software power side-channel attacks against the co-hosted LLM service?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": {"cloud-chain-auto-015-05": 0}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1710", "title": "Diagnosing Multi-Tenant GPU Power Side-Channel Leaks", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the MIG tenant's architecture leak through NVML telemetry, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": {"cloud-chain-auto-015-05": 1}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1711", "title": "Liquid Cooling Trade-offs at the Power Density Wall", "topic": "power-budgeting", 
"competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pay the $15M DLC premium or deploy one node per air-cooled rack, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1712", "title": "Rack Power Density Capacity Calculation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 42U racks are needed for 16 DGX H100 nodes under a 24kW rack limit, and what space utilization results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1713", "title": "Diagnosing PDU Trips in High-Density GPU Racks", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 15 kW PDU trip despite the average draw being only 9 kW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1714", "title": "Mitigating Power Analysis in Multi-Tenant GPU Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you mitigate 1ms GPU power-telemetry leakage, and how do dummy work, power caps, and SEV-SNP compare?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": {"cloud-chain-auto-015-05": 2}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1715", "title": "Calculating Total Facility Energy for Training Cluster", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many MWh will the 1,024-GPU cluster consume at the facility over the 30-day peak-utilization run?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 2}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1716", "title": "Data Center PUE vs Grid Carbon Intensity", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which colocation site should you choose, and what are the annual cost and carbon emissions for each facility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1717", "title": "Diagnosing PUE Degradation in Liquid Cooling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the new PUE 
after the liquid-cooling retrofit, why does it worsen, and is the cluster actually more sustainable?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 3}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1718", "title": "Rack-Level Power Wall Calculation for AI Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the proposed 10-node rack’s peak power draw, and how many 8-GPU nodes fit under the 40 kW rack limit?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 2}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1719", "title": "LLM KV Cache Waste with Static Pre-allocation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With static 2048-token KV allocation on a 24GB RTX 3090, what is the wasted memory per full batch and maximum batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1720", "title": "Diagnosing Sustained Training Throughput Drops on H100 Nodes", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 15-20% throughput drop after 20 minutes of stable training, and how can it be mitigated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1721", "title": "Evaluating H100 Cluster Power and Cooling Topologies", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you choose sparse air cooling or dense liquid cooling for the 1,024-GPU H100 cluster, and how do power limits affect networking, TCO, and throughput?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 4}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1722", "title": "Evaluating KV Cache Pre-allocation Waste", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much KV cache memory does 4096-token pre-allocation waste per request, and why should the 13B serving engine use dynamic allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1723", "title": "LLM Decode Speedup via Weight Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", 
"track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical decode tokens per second do FP16, INT8, and INT4 weight formats achieve on 16 TB/s of A100 memory bandwidth?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 2}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1724", "title": "Diagnosing LLM KV Cache Pre-allocation Waste", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory bottleneck limits the 70B service to 16 concurrent requests, and how large is the KV cache utilization gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1726", "title": "Diagnosing W8A16 Quantization Regression During Prefill", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W8A16 nearly double batch-1 decode speed but make 2048-token prefill 15% slower than FP16?", "chain_ids": ["cloud-chain-auto-014-12"], "chain_positions": {"cloud-chain-auto-014-12": 0}, "chain_tiers": {"cloud-chain-auto-014-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1727", "title": "Diagnosing Preemption Throughput Collapse", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the throughput collapse and GPU starvation when preemptively swapping 32K token KV caches over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1728", "title": "Evaluating Swap vs Recompute in Preemptive Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For preempting 16 batch tasks averaging 4000 tokens, should you swap KV cache over PCIe Gen4 or discard and recompute it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1729", "title": "KV Cache Preemption Swap Latency", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What total PCIe latency penalty does swapping out and back the KV cache for 32 requests at 2,000 tokens incur?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1730", "title": "Calculating Optimal Prefetch Buffer Depth for I/O Jitter", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": 
"fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How deep must the host DRAM prefetch buffer be to cover 1.2s I/O spikes without stalling 150ms GPU steps?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 0}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1731", "title": "Diagnosing GPU Starvation from P99 I/O Jitter", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU stalling, and how deep must the prefetch buffer be to guarantee zero starvation?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 1}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1732", "title": "LLM Inference Phase Bottlenecks", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the arithmetic intensities for a 2048-token prefill and single-token decode, and which hardware limit bounds each phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1733", "title": "Evaluating Prefetch Buffer Depth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use a 2-batch prefetch buffer or size it for the 600ms p99.9 tail, and what depth prevents GPU starvation?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 2}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1734", "title": "Estimating Prefix Caching Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much KV cache memory is saved when 256 requests share the same 2048-token system prompt via prefix caching?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 0}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1735", "title": "Evaluating Split-Pool vs Chunked Prefill", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the service use split-pool serving or chunked prefill to protect the 50ms/token decode latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1736", "title": 
"Diagnosing Zero-Hit Prefix Caching", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is automatic prefix caching hitting 0% with the SessionID and Date prefix, and how should the prompt be reformatted?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 1}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1737", "title": "Diagnosing Bottlenecks in LLM Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes TTFT/TBT spikes when long prefills and decodes share continuous batching, and what serving architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1738", "title": "Prefix Caching Trade-offs in Agentic LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you add global prefix caching for the 90% shared 4,000-token prompt, and when would cache overhead outweigh its benefits?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 2}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1739", "title": "Resolving Image Pipeline Preprocessing Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why will upgrading to a faster GPU fail to meet the 50ms latency budget, and what preprocessing fix meets it?", "chain_ids": ["cloud-chain-auto-secondary-016-03"], "chain_positions": {"cloud-chain-auto-secondary-016-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1740", "title": "Diagnosing End-to-End Latency in Image Serving", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you quantize the ResNet-50 model to INT8, or optimize the 40ms CPU preprocessing bottleneck first?", "chain_ids": ["cloud-chain-auto-secondary-016-03"], "chain_positions": {"cloud-chain-auto-secondary-016-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1741", "title": "PFC Buffer Headroom Calculation for 400GbE RoCEv2", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What minimum PFC buffer headroom per 400GbE port is needed to absorb in-flight packets after a pause trigger?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1742", "title": "Image Preprocessing Bottleneck in GPU Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Will upgrading the GPU meet the 15ms SLO, and what architecture is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1743", "title": "Diagnosing RoCEv2 PFC Buffer Drops", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the RoCEv2 collapse despite PFC, and how should you tune buffers and congestion control?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1744", "title": "Diagnosing P0 Inference Starvation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the P0 job starved, and how do you resolve this at a system level?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1745", "title": "Evaluating PFC Thresholds in 400G RoCEv2", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you lower the PFC XOFF watermark from 50% to 10%, and what systemic trade-offs does that create for RoCEv2 All-to-All traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1746", "title": "Network Bandwidth Priority Inversion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What delay does $J_H$ experience while $J_L$ writes 500GB at only 10Gbps, and what scheduling anomaly causes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1747", "title": "Priority Inversion in GPU Fleet Orchestration", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you resolve the P0 autoscaler blocked by a P2 lock holder starved by P1 work: priority inheritance, killing P2, or lock timeouts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1748", "title": "Data-Parallel Process Group Misalignment in 3D Training", 
"topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the DP AllReduce running over all 1024 ranks instead of the 8-rank DP group, and what process-group fix is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1749", "title": "PE Array Utilization and Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the accelerator’s peak TFLOPS, actual PE utilization, and primary bottleneck given 65.5 TFLOPS and 512 GB/s HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1750", "title": "Process Group Topology for 3D Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should TP or DP process groups be mapped to intra-node NVLink for TP=8, PP=16, DP=8 training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1751", "title": "Counting Data-Parallel Process Groups in a 3D Layout", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under TP=8, PP=8, DP=8 on 512 GPUs, how many DP process groups must be initialized, and which 8 ranks belong to each — and which links does the DP AllReduce traverse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1753", "title": "Memory Footprint for Progressive VLM Deployment", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the static weight memory footprints for the 86B FP16 cloud model, 7B INT8 edge model, and 1.5B INT4 mobile model?", "chain_ids": ["cloud-chain-auto-008-17"], "chain_positions": {"cloud-chain-auto-008-17": 0}, "chain_tiers": {"cloud-chain-auto-008-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1754", "title": "Monolithic vs Multi-Core PE Arrays", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PE array design delivers better throughput for independent batch-1 autoregressive decoding streams, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1755", "title": "Debugging Progressive Deployment Failures Across Tiers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "What is causing the edge-camera OOM crashes and the mobile tier’s 40% accuracy drop, and how would you fix them?", "chain_ids": ["cloud-chain-auto-008-17"], "chain_positions": {"cloud-chain-auto-008-17": 1}, "chain_tiers": {"cloud-chain-auto-008-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1756", "title": "Progressive Deployment Architecture Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a hardware-aware cascade across cameras, gateways, and cloud to meet the 100 ms SLA without losing recall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1757", "title": "Diagnosing OOM Failures in Long-Context Transformer Training", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 32,768-token CUDA OOM despite ZeRO-3 and checkpointing, and what exact attention change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1758", "title": "Evaluating 32K Context Scaling on A100 GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which of TP=8, FlashAttention-2 with checkpointing, or sparse attention should you choose to fit 32K context while preserving exact attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1759", "title": "Evaluating RoCEv2 QoS for Mixed Workloads", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you configure DSCP, ECN, PFC, and virtual lanes so inference stays under 50 ms P99 during training checkpoints?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 2}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1760", "title": "Diagnosing RoCEv2 Head-of-Line Blocking During Checkpoints", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do NCCL AllReduce timeouts correlate with 80 TB checkpointing despite no link oversubscription, and what network QoS fix would you apply?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 1}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1761", "title": "Attention 
Matrix Memory Calculation at 65K Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much FP16 memory does the 65,536-token attention probability matrix require with 32 heads, and what is the increase versus 4,096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1762", "title": "RoCEv2 Traffic Class Allocation with DWRR", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With DWRR weights 1 for storage and 3 for training on a 400 Gbps link, what bandwidth does each class get and does training meet its 250 Gbps SLO?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 0}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1763", "title": "Diagnosing QAT Divergence in LLMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT8 QAT diverge when layer norm outputs spike above 120, and what quantitative quantizer strategy would stabilize training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1764", "title": "Symmetric INT8 Scale and Error Simulation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For symmetric INT8 QAT with max magnitude 12.7, what are S, the INT8 value for activation 3.55, and the simulated quantization error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1765", "title": "Diagnosing DP Congestion in Rail-Optimized Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused DP AllReduce traffic to saturate only IB switches 0 and 1 after restart, and what scheduling constraint fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1766", "title": "Evaluate Rail-Optimized DP Replica Scheduling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For TP=8, PP=4, DP=32 on 8 InfiniBand rails, should DP replicas use unrestricted placement or identical GPU-index pinning, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1767", "title": "Sizing Leaf Switches for Rail-Optimized Clusters", "topic": "interconnect-topology", 
"competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 64-port 400 Gbps leaf switches are needed across all 8 rails for a 1:1 non-blocking network for 128 nodes?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 0}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1768", "title": "Rail-Optimized DP Topology Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For TP=1, PP=8, DP=64 on a 4:1 oversubscribed spine, what is the effective per-GPU gradient sync bandwidth for naive versus rail-optimized placement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1769", "title": "Evaluate QAT for LLM Serving", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why would INT8 QAT recover accuracy where PTQ failed for the 7B model, and what serving gains should you expect over FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1770", "title": "Diagnosing Stragglers in Multi-Node TP Fabrics", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TP=8 AllReduce latency jump 300% after replacing the rail-optimized InfiniBand fabric with a generic ECMP leaf-spine?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 1}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1771", "title": "Evaluating Rail-Optimized Topologies for Cross-Node TP", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For TP=16 spanning two nodes, how do standard leaf-spine and rail-optimized topologies affect cross-node TP AllReduce performance?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 2}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1772", "title": "Diagnosing Block Storage IOPS Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the EBS gp3 data loader cap at 400 MB/s despite 1,000 MB/s provisioned throughput, and how should the dataset be restructured?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": 
{"cloud-chain-auto-003-04": 1}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1773", "title": "NVMe Random IOPS Bottleneck in Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At 400,000 random 4 KB audio clips per second, what throughput can the NVMe SSD deliver, and does it meet the training requirement?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": {"cloud-chain-auto-003-04": 0}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1774", "title": "Evaluating Storage Upgrades vs Data Serialization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you buy the 500,000-IOPS Extreme NVMe tier or migrate 1B 50KB images to WebDataset/TFRecord, and why quantitatively?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": {"cloud-chain-auto-003-04": 2}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1775", "title": "CPU Reactive Burst Scaling vs GPU Pre-provisioning", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you absorb the 5,000 QPS launch spike with fast CPU spillover or pre-provision enough GPUs, and what is the cost trade-off?", "chain_ids": ["cloud-chain-auto-001-14"], "chain_positions": {"cloud-chain-auto-001-14": 1}, "chain_tiers": {"cloud-chain-auto-001-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1776", "title": "Hybrid CPU-Spillover for Flash Spikes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the bottleneck during the 6,000 RPS flash-sale spike, and how would you design CPU spillover to cover the 4-minute GPU warmup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1777", "title": "Absorbing Traffic Spikes with CPU Reactive Scaling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many 8-vCPU CPU instances are needed for the 2,400 QPS surge, and how does their cost per inference compare with the GPU baseline?", "chain_ids": ["cloud-chain-auto-001-14"], "chain_positions": {"cloud-chain-auto-001-14": 0}, "chain_tiers": {"cloud-chain-auto-001-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1778", "title": "RDMA vs TCP/IP Kernel Overhead Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much CPU processing time is saved by using RDMA instead of TCP/IP to transfer the 40 GB FP32 gradient tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1779", "title": "Diagnosing RDMA Fallback and Kernel Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What does 45μs NCCL latency with high ksoftirqd usage indicate, and how would you restore expected InfiniBand AllReduce throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1780", "title": "Calculating GPU Cluster Resource Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 4-GPU evaluation jobs can start immediately after 15 distinct 6-GPU training jobs, and how many GPUs are stranded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1781", "title": "Evaluating RoCEv2 vs TCP for LLM Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 2,048-GPU cluster use tuned TCP/IP over 400 Gbps Ethernet or RoCEv2 for AllReduce, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1782", "title": "Multi-Dimensional Resource Fragmentation and GPU Stranding", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are Job A pods pending despite 200 idle GPUs, and should you fix it with scheduler bin-packing, re-profiling Job B, or both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1783", "title": "REST API Serialization Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 64 embeddings of 1024 float32 values over JSON, what is the payload size, serialization time, and primary latency bottleneck?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 1}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1784", "title": "Evaluating Public REST to gRPC Migration for 
Vector APIs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace the public REST/JSON embedding API entirely with gRPC, or use a hybrid interface, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1785", "title": "Diagnosing Multi-Tenant GPU Resource Fragmentation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is an 8-H100, 1.5 TB fine-tuning job pending when the Kubernetes cluster shows 120 free GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1786", "title": "REST Serialization Bottleneck in Inference", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural bottleneck causes 98% CPU utilization and 15% T4 utilization for 500 JSON-encoded 1024-dim embeddings per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1787", "title": "Diagnosing Monolithic Accelerator Fab Rejection", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would a foundry reject a 1,250 mm² monolithic 5 nm AI accelerator, and what architectural shift is required?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 2}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1788", "title": "Yield Calculation at the Reticle Limit", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using Y = e^(-AD), what are the expected yields for the 858 mm² monolithic die and each 215 mm² chiplet at 0.1 defects/cm²?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1789", "title": "Monolithic vs. Chiplet Accelerator Evaluation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Considering 25% monolithic yield, 85% chiplet yield, 20 ns interposer latency, and 40 W overhead, which design should go to mass production for training a 500B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1790", "title": "Compute Cost of Forward vs. 
Reverse Mode Autodiff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What wall-clock time would reverse-mode versus forward-mode AD take to compute the full gradient for one batch of the 100M-parameter model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1791", "title": "Custom Autograd vs Reverse-Linked Graph", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which attention implementation should you choose: native PyTorch autograd saving all intermediates or a custom autograd function that recomputes them?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 2}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1792", "title": "Diagnosing OOM in Reverse Mode Differentiation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does VRAM spike and OOM at loss.backward() after the forward pass, and how can training fit without reducing batch size?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 1}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1793", "title": "Diagnosing Autograd Memory Leaks in Training Loops", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does GPU memory grow by 8 GB per validation step when the loop does epoch_loss += loss, and how should it be fixed?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 1}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1794", "title": "Evaluating Autograd Engines: Forward vs Reverse Mode", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you switch to Forward Mode Differentiation for the 50B model, and what should you use instead to reduce activation memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1795", "title": "Ring AllReduce Network Bandwidth Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum Ring AllReduce time to synchronize 4 GB of gradients across 16 nodes on 100 Gbps links?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1796", "title": "Validation Loop Memory Leak", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory will the retained validation graph consume over 200 batches, and what failure or fix should you expect?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 0}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1797", "title": "Evaluating Ring AllReduce Bottlenecks at Scale", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For synchronizing FP16 gradients of the 175B model on 64 A100s, should you use a Parameter Server or Ring AllReduce, and why?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 4}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1798", "title": "Mitigating Outliers with Robust Loss", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you change the loss to prevent FP16 overflows from 5% extreme outliers without giving up mixed precision?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 0}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1799", "title": "Diagnosing Gradient Explosions from Corrupted Cloud Data", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the NaN gradient crashes, and how would you modify the objective to stabilize training without filtering the 50 TB dataset?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 1}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1800", "title": "RoCEv2 Goodput for Small Tensor Parallel Messages", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical payload goodput for 256-byte RoCEv2 messages on the 400 Gbps link with 80 bytes of overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1801", "title": "Evaluating Robust Loss for Noisy Recommender Embeddings", "topic": "graph-compilation", 
"competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the 50B recommender use robust loss or Co-teaching to handle 8% label noise, and what are the hardware trade-offs?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 2}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1802", "title": "Diagnosing Flat Ring AllReduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the flat Ring AllReduce taking about 1.58 seconds, and what collective topology should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1803", "title": "Diagnosing RoCEv2 PFC Storms", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose this bottleneck, and what configuration changes stabilize the fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1804", "title": "Evaluating RoCEv2 vs InfiniBand for H100 Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 400 Gbps fabric would you choose for the 2,048-GPU MoE cluster, considering incast, tail latency, and TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1805", "title": "Evaluating Centralized vs Decentralized RBAC in Distributed ML Data Lakes", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design RBAC across Redis, Iceberg/S3, and archives to satisfy compliance while meeting a 5ms P99 latency SLA for feature serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1806", "title": "RBAC Policy Binding Compression Factor", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What reduction factor in IAM policy bindings does RBAC achieve versus direct per-dataset user permissions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1807", "title": "RBAC API Rate Limiting Bottleneck in Distributed Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", 
"question": "What is causing the dataloader I/O collapse, and how should RBAC be redesigned to avoid per-object IAM checks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1808", "title": "Roofline Model Analysis on A100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the model's theoretical maximum throughput on the GPU, and is it compute-bound or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1809", "title": "Runtime Input Validation Latency Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many CPU cycles per feature dimension are available for validating 1024 float32 features within the 10 microsecond latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1810", "title": "Diagnosing LLM Decode Inefficiency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the Triton compute optimization fail for token decoding, and what is the fundamental Roofline bottleneck?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 2}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1811", "title": "H100 Migration for LLM Decoding", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you migrate the low-batch 70B decoding service from A100 to H100, given 2 FLOPs/byte arithmetic intensity and 2x cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1812", "title": "Mitigating Sponge Attacks via Runtime Input Validation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which validation strategy should protect the H100 LLM fleet from sponge examples while staying under the 20ms P99 overhead budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1813", "title": "Diagnosing Image Decompression Bombs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"training", "question": "Why can requests under the 5MB limit cause 14GB preprocessing spikes, and what validation should be added before decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1814", "title": "Runtime Entropy Monitoring for Adversarial Shifts", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum adversarial request rate is needed to push the 1-minute average entropy above the 1.2-bit alert threshold?", "chain_ids": ["cloud-chain-auto-secondary-017-15"], "chain_positions": {"cloud-chain-auto-secondary-017-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1815", "title": "Diagnosing Runaway Generation from Prompt Injections", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using only runtime output monitoring metrics, how do you diagnose this root cause and differentiate it from normal heavy usage?", "chain_ids": ["cloud-chain-auto-secondary-017-15"], "chain_positions": {"cloud-chain-auto-secondary-017-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1816", "title": "Latency SLAs for Real-Time Saliency Maps", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What latency would 50-step Integrated Gradients add, and why should you use a vanilla Saliency Map to meet the 30ms SLA?", "chain_ids": ["cloud-chain-auto-011-04"], "chain_positions": {"cloud-chain-auto-011-04": 0}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1817", "title": "Explainability Latency Bottleneck on T4 GPUs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling Saliency Maps spike p99 latency to 280ms and drop GPU utilization despite the expected 75ms compute cost?", "chain_ids": ["cloud-chain-auto-011-04"], "chain_positions": {"cloud-chain-auto-011-04": 1}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1818", "title": "Explainability Latency Trade-offs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the medical image service use Saliency Maps or 50-step Integrated Gradients to satisfy the 150ms p99 explanation SLA?", "chain_ids": ["cloud-chain-auto-011-04"], 
"chain_positions": {"cloud-chain-auto-011-04": 2}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1820", "title": "A100 Multi-Node Scaling Efficiency", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the two-node scaling efficiency, and how does it change the cost per 1M samples trained versus one 8-GPU node?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1821", "title": "Diagnosing Elastic Scaling Collapse", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does throughput rise only 15% when scaling from 64 to 128 GPUs with a static global batch size, and what should the scheduler do?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1822", "title": "Evaluating GPU Allocation Limits via Scaling Efficiency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate allocating the requested 128 GPUs versus capping the job at a lower GPU count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1823", "title": "Diagnosing Ingestion Bottlenecks in Synchronous Schema Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the FastAPI ingestion latency spike and 503s after adding synchronous Python JSON schema validation, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": {"cloud-chain-auto-secondary-015-27": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1825", "title": "Evaluating Ingestion Schema Validation for High-Throughput Streams", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you split schema validation between the edge API gateway and stream processor for 500,000 JSON bid requests/sec?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": {"cloud-chain-auto-secondary-015-27": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1826", "title": "SecAgg Mask Generation Compute Overhead", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"both", "question": "If a client's mobile SoC has a PRG throughput of 2.0 GB/s, how much latency does each client add generating SecAgg pairwise PRG masks for 500 clients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1827", "title": "Confidential VM Cold Start Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With pipelined network download, decryption, and PCIe transfer, how long is the 14GB model cold-start load into GPU memory?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 0}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1828", "title": "Hierarchical Secure Aggregation Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the global FL aggregator use one 5,000-client SecAgg group or 10 groups of 500, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1829", "title": "Capacity Reduction and Abstention", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If an attacker systematically queries the API, how many records are vulnerable before and after reducing capacity and adding abstention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1830", "title": "SecAgg Dropout Recovery Compute Explosion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the SecAgg dropout recovery latency spike when 5% of 10,000 devices drop offline, and how should it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1831", "title": "Evaluating Confidential Computing for LLM Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you enable Confidential Computing for the 70B medical LLM, and will it keep p99 latency under 200ms/token?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 2}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1832", "title": "Mitigating Memorization via Architectural Capacity Constraints", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "What architectural changes would make the 13B financial summarization LLM natively resist training data extraction attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1833", "title": "Diagnosing High TTFT in Confidential GPU Enclaves", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the TTFT regression with Confidential Computing enabled while decoding throughput remains normal?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 1}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1834", "title": "Architectural Abstention in Cloud LLMs", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do these choices impact susceptibility to data extraction while maintaining TTFT under 300ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1835", "title": "SMPC Communication Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total communication time for one SMPC inference given 100 layers, 10 rounds per layer, 500 MB, 10 Gbps, and 5 ms RTT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1836", "title": "Diagnosing SMPC Latency Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why has SMPC throughput fallen to 0.2 QPS with low CPU and bandwidth usage, and should you upgrade to AES-NI instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1837", "title": "SMPC vs FL for Cross-Institution DNN Training", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the consortium use SPDZ-style SMPC or federated learning with differential privacy for the 100M-parameter model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1838", "title": "SimCLR Contrastive Collapse on A100s", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the SimCLR plateau at 45% linear-probe accuracy and 15% GPU memory utilization, and how would you fix it?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1839", "title": "Masked Autoencoder Pre-training Compute Optimization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many ExaFLOPs per epoch does MAE save versus full-patch training with 75% masking, 30 TFLOPs baseline, 2 TFLOPs decoder, and 1.2M images?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1840", "title": "LSTM Sequential Latency Bound on A100 GPUs", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical minimum latency for one 512-token LSTM sequence, and will increasing batch size to 256 reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1841", "title": "Evaluating SSL Pre-training for Edge Resilience", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose SimCLR or MAE to pre-train the 630M-parameter ViT-Huge on 256 A100 40GB GPUs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1842", "title": "LLM Data Ingestion with Sequential Streaming Formats", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the metadata overhead and throughput differ between fetching 100M individual 100KB JSON files versus streaming 10,000 sequential 1GB shards?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1843", "title": "Evaluating Streaming Formats for VLM Training", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you migrate the 5TB dataset to a sequential streaming format like WebDataset (TAR) or tune Parquet, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1844", "title": "Diagnosing Object Store API Bottlenecks", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 500TB multimodal pipeline limited to 1.5 GB/s despite a 100 
Gbps link, and what data layout change would fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1846", "title": "Evaluating RNN Optimization Strategies on A100s", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can CUDA graphs and custom fused kernels make the 2048-step LSTM reach Transformer-level GPU utilization, or is the bottleneck algorithmic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1847", "title": "Data Pipeline Throughput Calculation for NVMe SSDs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What read throughput do 500M 4KB JSON files achieve on the NVMe SSD, does it starve the 4 GB/s GPU pipeline, and how should you store the data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1848", "title": "JSON Serialization Overhead in Batch APIs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What serialization overhead does JSON add versus Protobuf for 500 candidate 512-dim float32 embeddings, and how would latency change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1849", "title": "Evaluating Sequential Storage Patterns for Distributed Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are 64 GPUs starved while reading 500M individual 500KB JPEGs from Lustre, and what data layout should you use?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 2}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1850", "title": "Diagnosing NVMe I/O Starvation in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the storage system failing to reach its advertised throughput, and how do you confirm the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1851", "title": "Evaluating RPC Frameworks for High-Throughput Embeddings", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": 
"both", "question": "Should the embeddings API migrate from JSON REST to gRPC/Protobuf bytes or rely on response caching to maximize GPU saturation, and why?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 3}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1852", "title": "Bottleneck Analysis in Python REST/JSON Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency over 35ms when ONNX inference is 5ms, and what mitigation would you propose?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 2}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1853", "title": "Evaluating 112G vs 224G SerDes Trade-offs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 51.2T switch fabric move from 112G to 224G PAM4 SerDes given 2.5m intra-rack reach requirements, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1854", "title": "Strict SLO in Ad Recommendation Systems", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If the model's base inference latency is 12ms for a single request, and each additional request in the batch adds 1ms to the inference time, what is the maximum permissible batch window (wait time) the dynamic batcher can use if it targets a maximum batch size of 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1855", "title": "SLO Allocation in Multi-Stage Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the feed use static per-service latency budgets or dynamic leftover timeouts to meet the 250ms global P99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1856", "title": "Diagnosing P99 SLO Violations in Dynamic Batching", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the DLRM pipeline violate the 150ms P99 SLO, and what maximum dynamic batching timeout should it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1857", "title": "Debugging 112G PAM4 SerDes FEC Failures", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": 
"L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve this high-temperature pre-FEC BER bottleneck at the switch level?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1858", "title": "Autoscaling Ramp-up and Boot Time Lag Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much buffer capacity in RPS and nodes is required to avoid SLA violations during the 3-minute autoscaling boot window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1859", "title": "LLM Autoscaling and KV Cache Bottlenecks", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does TTFT exceed 4 seconds when GPU compute utilization stays below 75%, and what metric should autoscaling use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1860", "title": "Evaluating Custom Serving Engine ROI", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 3-month delay to build a 2x-throughput inference engine financially justified at 1,000 QPS and 1,024 output tokens?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 3}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1861", "title": "Calculating the Training vs. 
Inference Cost Crossover", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "In how many days will inference costs exceed the initial 30-day training cost for the 70B LLM API?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 1}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1862", "title": "Analyzing the Inference OpEx Explosion", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why has serving OpEx eclipsed the $300k training CapEx for the 30B recommender, and what architectural inefficiency should be fixed?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 2}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1863", "title": "Diagnosing SLO Violations from Batch Accumulation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 TTFT spiking above 1.6 seconds at 80 RPS with dynamic batch size 128, despite sufficient GPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1864", "title": "Batching Limits Under Strict Latency SLAs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum dynamic batch size meets the 60ms latency SLA, and what serving throughput does it allow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1865", "title": "LLM Serving Inversion Trade-offs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should batching and scheduling change to meet TTFT <200ms and inter-token latency <50ms for the 7B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1866", "title": "LLM Serving Autoscaling Under Burst", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you handle 3x traffic spikes within 60s given 4-minute LLM cold starts without paying for a permanent 3x warm buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1867", "title": "Calculating LLM Microservice Serving Tax", "topic":
"latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the absolute serving tax and final TTFT for the Gateway-Tokenizer-Model Worker request path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1868", "title": "Diagnosing Microservice Serving Tax", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing p99 latency to bloat from 20ms GPU compute to 85ms in the JSON/REST recommender pipeline, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1869", "title": "Evaluating Microservice Serving Tax for ML Pipelines", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you split the 120ms monolithic recommender into gRPC microservices with 2MB tensor payloads under a 200ms p99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1870", "title": "Sizing Cloud Shadow Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many instances are needed to shadow 12,000 RPS at no more than 75% utilization?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 0}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1871", "title": "Diagnosing Latency Spikes in Synchronous Shadow Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 100% shadow deployment add 25ms to user-facing P99 latency even though the shadow predictions are discarded?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 1}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1872", "title": "Architecting Shadow Deployments for Latency-Sensitive APIs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What risks does application-layer shadowing create at 5,000 QPS, and what production-grade alternative should you use?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 2}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1873", "title": "Diagnosing Object Storage Stalls in Global Shuffling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does perfect global shuffling over 100M 100KB samples from object storage stall the pipeline, and what shuffling strategy should replace it?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 1}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1874", "title": "Sizing Cloud Shard Shuffle Buffers", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What K and B would you configure to maximize randomness under 64GB RAM while saturating the 10Gbps S3 link?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 0}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1875", "title": "Evaluating Global vs. Shard-Level Shuffling for 10TB LLM Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which shuffling design would you choose for the 10TB S3 dataset on 1024 GPUs, and why?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 2}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1876", "title": "70B LLM FSDP Checkpoint Time Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the monolithic and distributed sharded checkpoint write times, and what bottlenecks each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1877", "title": "Diagnosing Sharded Checkpoint Metadata Stalls", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the >5 minute sharded checkpoint stalls on NFS, and how would you fix them?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 3}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1878", "title": "SDC Rate and Optimal Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", 
"phase": "training", "question": "What is the system-level SDC MTBF on 8,192 GPUs, and what checkpoint interval is optimal with 6-minute checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1879", "title": "Root-Causing Deterministic Loss Divergence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically diagnose the root cause of the deterministic loss divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1880", "title": "Mitigating Silent Data Corruption in LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs and select an SDC mitigation strategy to avoid poisoned checkpoints without drastically reducing Goodput.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1881", "title": "Evaluating Sharded Checkpointing for 70B LLMs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you migrate from global to distributed sharded checkpointing, and what trade-offs arise when resuming on a different cluster size?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1882", "title": "Evaluating Silent Degradation in Embedding Fleets", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the trade-offs to catch this silent degradation within 5 minutes and <2% latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1884", "title": "Early-Exit Bypass with Synthetic Benchmark Data", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why was the BERT-base benchmark invalid, and how should CI benchmark throughput without masking quality regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1885", "title": "Evaluating Proxy Metrics for Delayed-Label Drift Detection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which proxy metric strategy would detect the new synthetic fraud vector within 24 hours, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1886", 
"title": "Fraud Detection Silent Failure Investigation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What root causes could explain the complaint spike despite healthy infrastructure, and how would you detect them systematically?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1887", "title": "Silicon Interposer Edge Density Calculation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total memory bandwidth and minimum compute-die edge length needed to interface with all 6 HBM3 stacks?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 0}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1888", "title": "Evaluating 2.5D Silicon Interposer Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which packaging option should you choose for memory-bound trillion-parameter LLM inference, and how does it affect TCO?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 2}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1889", "title": "Root-Causing 2.5D Packaging HBM Failures", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What packaging-level physical issue is causing the edge HBM stacks to fail memory training?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 1}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1890", "title": "SIMT Latency Hiding via Little's Law", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many active threads per SM are required to saturate 2.0 TB/s HBM bandwidth and hide 400 ns memory latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1891", "title": "Analyzing SIMT Register Pressure and Memory Stalls", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is this A100 kernel latency-bound at only 30% HBM bandwidth, and what would you change?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1892", "title": "SIMT Warp Mapping for Sparse Attention", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which sparse attention mapping strategy better leverages the SIMT execution model to hide the ~300 cycle global memory latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1893", "title": "CPU Overhead Calculation for SR-IOV Network Bypass", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many host CPU cores per node does SR-IOV save by bypassing the vSwitch for 4x400Gbps traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1895", "title": "JIT Compiler SSA Graph Memory Footprint Calculation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the naive memory footprint of the h state variables in the unrolled SSA graph before liveness or buffer reuse?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 1}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1896", "title": "Root-Causing 400GbE Bottlenecks in Virtualized GPU Nodes", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural misconfiguration is causing this massive throughput collapse and forcing high host CPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1897", "title": "Skip Connection Gradient and Memory Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the first-layer gradient magnitudes with and without skips, and how much FP16 memory does one bypass tensor require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1898", "title": "Evaluating Memory Costs of Skip Connections", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do dense skip connections cause OOM during the backward pass on 24GB GPUs, and how would you preserve gradient flow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1899", "title": "Evaluating SSA Form for JIT Compiler Optimization", 
"topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you bypass SSA generation to save 15 minutes, and how would that affect liveness, fusion, and throughput?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 3}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1900", "title": "Diagnosing Gradient Collapse in Deep CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did removing bypass paths stall the 150-layer 3D CNN, and how would you restore trainability within the memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1901", "title": "Diagnosing Metadata Server Collapse on Lustre", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing low GPU utilization and 100% Lustre MDS CPU, and how should the dataset be stored instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1902", "title": "Evaluating Storage Bottlenecks for Millions of Tiny Images", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option should the team choose to fix 15% GPU utilization, and why is it better than storage hardware upgrades?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1903", "title": "Metadata IOPS Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What IOPS and bandwidth are required, what bottlenecks utilization, and how much would 100MB WebDataset shards reduce IOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1904", "title": "Debugging SSA Compiler OOM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does SSA generation for the unrolled generation loop blow up CPU memory, and what structural fix avoids it?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 2}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1905", "title": "Balancing W8A8 Outliers with SmoothQuant", "topic": "quantization-fundamentals", "competency_area": "precision", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What SmoothQuant scale s_j balances activation max 121 and weight max 1, and what are the resulting maxima?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 0}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1906", "title": "DLRM Embedding Lookup Bandwidth Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory read volume per batch, and what is the theoretical bandwidth-bound maximum batches per second for the lookup stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1907", "title": "Evaluating SmoothQuant for 175B LLM Serving", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 activation quantization approach would you choose to restore accuracy while maintaining dense Tensor Core throughput, and why?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 2}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1908", "title": "Diagnosing DLRM Embedding Lookup Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why doesn't increasing the batch size fix the SM utilization, and what is the fundamental root cause of this performance ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1909", "title": "Diagnosing Accuracy Collapse in INT8 LLM Deployments", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve this bottleneck without incurring the massive latency overhead of dynamic per-channel activation quantization?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 1}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1910", "title": "DLRM Embedding Sparse Scatter Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What data transfer rate and bandwidth utilization do 10 million 128-byte sparse accesses produce, and what bottleneck causes it?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 0}, "chain_tiers": {"cloud-chain-auto-008-10": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1911", "title": "Evaluating Embedding Sharding in DLRMs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you parallelize the 400GB embeddings and 2GB dense layers across 8 A100s to minimize step latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1912", "title": "Diagnosing Low Utilization in DLRM Lookups", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the pipeline stalling, and what specific architectural constraint is causing this severe underutilization?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 1}, "chain_tiers": {"cloud-chain-auto-008-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1913", "title": "Evaluating DLRM Sparse Embedding Placement", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 500GB DLRM embeddings live entirely in CPU DRAM or use a hybrid GPU-HBM cache, and why?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 2}, "chain_tiers": {"cloud-chain-auto-008-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1914", "title": "Diagnosing I/O Bottlenecks in Stochastic Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "System profiling shows high CPU I/O wait times and near-zero page cache hit rates. You are using standard PyTorch Datasets reading individual files with `shuffle=True`. 
What is causing this I/O bottleneck, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1915", "title": "Evaluating OS Page Cache for Stochastic ML Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you rely on POSIX individual-file reads and the OS page cache for the 100 TB ViT-H dataset, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1916", "title": "SFU Bottlenecks in Custom Routing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the custom MoE gating function taking 35% of the forward pass despite being O(N), and how would you remove the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1917", "title": "Calculating Effective Bandwidth Under Stochastic I/O", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you eliminate the 500 MB/s random-read bottleneck and reach the 1.6 GB/s needed for 16,000 images/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1918", "title": "Evaluating Dedicated SFUs vs Vector ALUs for Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator architecture is better for Transformer serving: flexible ALUs or 2-cycle SFUs that cost 12% die area, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1919", "title": "Profiling SFU vs Memory Bottlenecks in GeLU", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 8192 x 32768 FP16 GeLU on an A100, is the standalone kernel SFU-compute-bound or HBM-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1920", "title": "Covariance Pruning for Backdoors", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many examples should you prune from the suspicious 50,000-example class using a 1.5x margin over a 0.5% poison rate?", "chain_ids": ["cloud-chain-auto-003-20"], "chain_positions": {"cloud-chain-auto-003-20": 0}, "chain_tiers": {"cloud-chain-auto-003-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1921", "title": "Evaluating Spectral Signatures Computation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the team's concern about a prohibitive 1.2M-vector SVD valid, and how should Spectral Signatures actually be computed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1922", "title": "Diagnosing Multi-Tenant Cache Side-Channels", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware-level vulnerability is leaking Tenant A's weights via branch mispredictions and L3 cache timing, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1923", "title": "Identifying Poisoned Data via Spectral Signatures", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use the 4.5x top singular value in the suspected class to identify and filter poisoned examples?", "chain_ids": ["cloud-chain-auto-003-20"], "chain_positions": {"cloud-chain-auto-003-20": 1}, "chain_tiers": {"cloud-chain-auto-003-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1925", "title": "Mitigating Spectre in Multi-Tenant NLP Gateways", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you enforce IBPB and disable SMT for the shared EPYC tokenization tier, or use stronger isolation instead, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1926", "title": "Spot Instance Preemption TCO", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the expected total cost and wall-clock time of the 100-hour 8-node run on Spot versus On-Demand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1927", "title": "Diagnosing Spot Preemption Hangs in Distributed Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this specific failure pattern, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1928", "title": "Mitigating Watermark Spurious Correlations via Group Sampling", 
"topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What resampling weights are needed to perfectly balance Watermark/No-Watermark x Positive/Negative groups in each 1024-image batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1930", "title": "Evaluating Spot Instance Cost vs Recovery Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do pure On-Demand and pure Spot compare for the 14-day 13B LLM run in total cost and time-to-market?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1932", "title": "Diagnosing Missing Deployment Invariants", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What structural invariant is missing from the Evaluation-to-Deployment contract for the 7B FP16 model on 24GB instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1933", "title": "VRAM Constraint Validation for Model Promotion", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the FP16 7B model satisfy the T4 staging memory contract, and if not what minimal quantization bit-width passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1934", "title": "LLM Serving Stage Invariants", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What static invariants and dynamic profiling checks should the Optimization-to-Serving contract use to guarantee TBT < 50ms on L4 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1935", "title": "Diagnosing Staged Data Pipeline Starvation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you remove the 210ms sequential data-pipeline stall and maximize utilization on the node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1937", "title": "Optimizing Retraining Interval for E-Commerce", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What retraining interval minimizes average daily compute plus staleness cost for the recommendation 
model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1938", "title": "Diagnosing Optimal Retraining Frequency for Ad CTR Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is retraining every 12 hours optimal for the CTR model, or what retraining interval minimizes compute plus staleness cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1939", "title": "Optimizing the Retraining Cadence", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What static retraining interval minimizes total business cost for the ad-click model given $5,000 retrains and rising daily staleness loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1940", "title": "Evaluating Staged Pipeline Bottlenecks", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which throughput improvement option—PCIe Gen5, more num_workers, or GPU DALI augmentation—should you choose, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1941", "title": "Stateful LLM Serving Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which fault-tolerance strategy should you use to minimize P99 recovery latency and operational overhead, and how do they quantitatively compare?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 2}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1942", "title": "Cascading KV Cache OOMs in Stateful LLM Serving", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did reshuffling 2,000 stateful LLM sessions cause OOMs and 15s P99 TTFT, and how should failover be redesigned?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 1}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1943", "title": "Stateful KV Cache Recovery Tradeoffs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 400 crashed sessions, is a Redis KV-cache restore over a 100 
Gbps network or a full recompute prefill faster, and what are the recovery times?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 0}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1944", "title": "LLM Stateless Serving Database Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the stateless LLM serving design bottleneck on PostgreSQL at 5,000 req/sec despite adding GPU worker pods?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1945", "title": "Calculating Maximum Static Batch Size for Strict SLA", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If your strict P99 latency SLA is 50ms, what is the maximum static batch size you can configure to ensure the first request in any batch meets the SLA limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1946", "title": "Stateless vs Stateful LLM Serving Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 70B chat service use stateless routing or stateful sticky KV-cache serving, and how do failures change the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1947", "title": "Off-Peak Latency Spikes in Static Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this performance degradation under low load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1948", "title": "Diagnosing Static Graph Compilation OOM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does static graph JIT compilation OOM on a 256GB host for dynamic sequence lengths, and how would you bound the compiler memory?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 2}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1950", "title": "Memory Traffic Savings via Operator Fusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much HBM traffic do eager versus fused static 
execution use for the three FP16 pointwise ops, and how long should the fused kernel take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1951", "title": "Evaluating Static Graphs for DLRM Inference", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the DLRM serving stack use PyTorch eager mode or an ahead-of-time static graph, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1952", "title": "Diagnosing Peak Load Bottlenecks in Recommendation Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are peak login requests saturating the GPUs, and how would static inference eliminate the latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1953", "title": "Scaling DLRM Static Inference Caching", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will 8 GPUs take to refresh recommendations for 50M users, and how much raw Redis memory stores top-50 64-bit IDs per user?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1954", "title": "Diagnosing High Idle Power in GPU Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the H100 cluster still draw 60% of peak power at 20% utilization after inlet temperatures rise to 28°C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1955", "title": "Datacenter Idle GPU Static Power Scaling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total facility-level power wasted by the 800 idle GPUs at 70°C with a datacenter PUE of 1.2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1956", "title": "Datacenter Cooling Trade-off: Static Power vs CapEx", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the $4M liquid cooling system justified on energy savings alone over 1 year for the 8,000 H100 training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1957", "title": "Quantifying Non-Stationarity Financial Impact", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", 
"level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What daily revenue loss does the Q3 drift cause, and how quickly must detection and retraining finish to keep losses below $50,000?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1958", "title": "Evaluating Static Inference Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you precompute all 20M feed profiles in Redis, keep dynamic T4 inference, or use a hybrid architecture at 50,000 QPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1960", "title": "Diagnosing Loan Approval Parity", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the Statistical Parity Difference for Groups A and B, and why is 92% accuracy not enough for compliance?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 1}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1961", "title": "Evaluating Stationarity Violations in Cloud ML", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use weekly sliding-window retraining ($1,200/week) or real-time online learning to adapt to this shifted loan model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1963", "title": "KV-Cache Affinity in Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many sessions lose KV-cache affinity when adding 10 canary nodes with modulo hashing versus consistent hashing?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 0}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1964", "title": "Evaluating L7 Sticky Routing for LLM Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you pin chats with L7 consistent-hash sticky routing on session_id or a model-version JWT during the 10% canary, and why?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 2}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1965", "title": "Diagnosing Context Amnesia in Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the context amnesia during the 10% canary, and what networking-level fix would you implement?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 1}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1966", "title": "Diagnosing Edge-to-Cloud Satellite Storage Saturation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the 5 Mbps, 6-hour satellite window offload 50GB/day, and how should the pipeline avoid filling the 2TB buffer?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 1}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1968", "title": "Sizing Edge Buffers for Satellite ML Pipelines", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the nightly transfer capacity, is it enough for 180GB/day, and how large an SSD buffer is needed for 4 blackout days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1969", "title": "Edge-to-Cloud Telemetry Ingestion Architecture", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the edge devices use synchronous daytime streaming or store-and-forward over the 2-hour satellite window, and why?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 2}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1970", "title": "Federated Learning Over-selection for Stragglers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many clients should be selected to get 1,000 fast updates without stragglers, and what round latency reduction results?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 0}, "chain_tiers": {"cloud-chain-auto-004-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1971", "title": "Diagnosing Cross-Silo Straggler Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "How do you mitigate this straggler effect while preserving the statistical contribution of high-latency nodes?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 1}, "chain_tiers": {"cloud-chain-auto-004-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1972", "title": "Evaluating Straggler Mitigation Strategies in Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs: should you switch to FedAsync, or over-select 1,500 clients and wait for the fastest 1,000?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 2}, "chain_tiers": {"cloud-chain-auto-004-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1973", "title": "Streaming Architecture for Strict Ad-Bidding SLAs", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose Flink with local state or Spark Streaming with Redis for 500,000 bid requests/sec under a 30ms P99 SLA, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-15"], "chain_positions": {"cloud-chain-auto-secondary-015-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1975", "title": "Diagnosing Single-Threaded Consumer Lag in Kafka", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the ingestion bottleneck when one CPU core is pegged and the other 15 are idle?", "chain_ids": ["cloud-chain-auto-secondary-015-15"], "chain_positions": {"cloud-chain-auto-secondary-015-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1976", "title": "Streaming Inference Batch Size Limits", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If you buffer chunks from a *single* audio stream to increase throughput, what is the maximum batch size you can accumulate without violating the SLA for the first chunk in the batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1977", "title": "A100 2:4 Structured Sparsity Memory Overhead", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the compressed memory footprint of the 8192x8192 FP16 matrix under 2:4 sparsity, 
including metadata, and what are the exact memory savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1978", "title": "Evaluating 2:4 Structured Sparsity on A100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy 70% unstructured pruning or 2:4 structured sparsity for the 7B model on A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1979", "title": "Unstructured vs Structured Sparsity Slowdown on A100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did 75% unstructured pruning slow inference on A100s, and what sparsity mechanism is needed for acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1980", "title": "Streaming LLM TBT Spikes", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 TBT spike at 40 streams on the 24GB instance, and what serving memory strategy fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1981", "title": "Stateful Streaming vs Stateless Batching ASR", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which ASR architecture should you deploy to meet the <300ms TTFW SLA for 5,000 sessions, and how would you size it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1983", "title": "SUTVA Violations in Dispatch A/B Testing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the measured A/B delta and true rollout impact on rider wait time, and what test design fixes the SUTVA violation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1985", "title": "Diagnosing Synchronous Checkpoint Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the primary bottleneck causing 5-minute synchronous checkpoints when interconnect utilization is under 1%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1986", "title": "Evaluating Synchronous Checkpointing Overheads", "topic": "fault-tolerance-checkpointing", "competency_area": 
"reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the synchronous checkpointing overhead for the 4.8TB state, and is a two-tier local-NVMe asynchronous strategy justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1987", "title": "Synchronous Checkpoint Overhead Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long does each checkpoint pause training for a 300B mixed-precision Adam state, and what percentage overhead does a 20-minute cadence add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1988", "title": "Roofline Benchmarking of Custom Layers", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the layer's arithmetic intensity, Roofline-limited throughput, and bound type on the A100?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 1}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1989", "title": "Predicting Model Degradation Thresholds", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "When must retraining start to keep accuracy above 85.0%, and what is the annual compute cost of that retraining cadence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1990", "title": "Diagnosing CTR Degradation Post-Peak Event", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this silent failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1991", "title": "Evaluating LLM Inference Benchmarks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the vendor's 3x latency reduction claim valid for batch-1 70B LLM serving, and how should you benchmark the real speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1992", "title": "Diagnosing Low TFLOPS in Single-Batch Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch-1 autoregressive inference achieve only 2-4 TFLOPS 
instead of the advertised 312 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1993", "title": "Evaluating Mitigation Strategies for Recommendation System Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 100GB DLRM use daily full retraining or continuous 15-minute online learning to fight -0.5% weekly CTR drift, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1994", "title": "LLM Activation Observability Offload Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How large is the FP16 activation payload for 5 layers with batch size 16, sequence length 1024, and hidden size 8192?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1995", "title": "Diagnosing Explainability Control Plane Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 400ms/token P99 spike, and how would you extract the 2GB of state without starving the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1996", "title": "Systolic Array Peak Throughput Estimation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the accelerator's theoretical peak BF16 throughput in TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1997", "title": "Systolic Array Sizing and Padding Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is more efficient for the 64x1024 QKV projections without cross-request batching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1998", "title": "Systolic Array Padding Underutilization: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the <20% TPU utilization for the [64,100] x [100,64] dense layer, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1999", "title": "Fan-out Tail Latency Probability", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": 
"fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What percentage of scatter-gather requests will see 200ms or higher latency across all 50 services?", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 0}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2000", "title": "Evaluating Hedged Requests for Fan-Out Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which option should you use to meet the 50ms system P99 SLA: hedged requests at 25ms or 2x GPU over-provisioning, and why?", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 2}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2001", "title": "LLM Control Plane Explainability Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you log full layer activations or lightweight linear-probe scores for real-time explainability at 5,000 QPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2002", "title": "Diagnosing Parallel Fan-out Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Diagnose why the median user latency degraded in the 50-way scatter-gather system, and calculate the chance of hitting the 150ms tail.", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 1}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2003", "title": "Evaluating Telemetry Aggregation for LLM Inference Fleets", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use 1% random sampling or node-level sidecar aggregation for the 50,000 QPS telemetry pipeline, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2004", "title": "LLM Inference Trace Saturation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is saturating the 100 Gbps observability uplink, and what node-level aggregation strategy would you deploy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2005", "title": "Edge Telemetry Aggregation for 
High-Throughput Inference", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total egress bandwidth results from keeping 100% of errors, sampling 1% of successes, and sending 100 KB/s of metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2006", "title": "Tensor Core Bottlenecks in GEMM", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the arithmetic intensity, bottleneck type, and minimum execution time for the M=N=K=8192 FP16 GEMM on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2007", "title": "Evaluating Tensor Memory Layouts for Tensor Cores", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you switch the ResNet pipeline to NHWC despite three NCHW-only custom kernels, and when is the conversion cost justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2008", "title": "Diagnosing Tensor Core Underutilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve the bottleneck preventing high Tensor Core utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2009", "title": "Diagnosing Non-Contiguous Tensor Memory Bottlenecks", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this degradation at the tensor abstraction level, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2010", "title": "Tensor Contiguity Memory Overhead", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the exact High Bandwidth Memory (HBM) traffic (in MiB) generated strictly by this `.contiguous()` operation.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2011", "title": "Optimizing Tensor Core Utilization via Dimension Padding", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you pad the FFN intermediate dimension from 3000 to 3072, and how do the Tensor Core trade-offs justify it?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2012", "title": "Evaluating Payload-to-Tensor Transformation Architectures", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should clients send uncompressed FP16 NCHW tensors, or should Triton accept JPEGs and use DALI for GPU preprocessing, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2013", "title": "JSON to Tensor Parsing Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the JSON parsing overhead for the batch of 32 images, and what transport and preprocessing changes would fix the latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2014", "title": "CPU Bottleneck in Tensor Format Transformation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 45ms latency despite 1.5ms GPU compute, and how would you redesign the input path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2015", "title": "Diagnosing MFU Collapse in Cross-Node Tensor Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did increasing Tensor Parallelism from TP=8 to TP=16 collapse MFU, and how should parallelism cross node boundaries instead?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 1}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2016", "title": "Transposed Tensor Bandwidth Collapse", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does transposing the [32,16,4096,128] tensor before the custom kernel destroy memory bandwidth, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-017-39"], "chain_positions": {"cloud-chain-auto-secondary-017-39": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2017", "title": "Evaluating TP Topology Across Network Boundaries", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you serve the 175B model with TP=16 across nodes or TP=8 within each node 
plus PP=2 across nodes, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2018", "title": "Calculating NCHW Tensor Memory Offsets", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the linear element offset for index [2,128,32,16] in the contiguous NCHW [8,256,64,64] FP32 tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2019", "title": "Calculating 1D Tensor Parallelism Overhead", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For TP=8, what are the per-GPU MLP weight footprint and the All-Reduce activation payload size for the micro-batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2020", "title": "Evaluating NCHW vs NHWC Layouts", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you leave the ResNet-101 in NCHW with implicit cuDNN transposes, or switch globally to NHWC, and why?", "chain_ids": ["cloud-chain-auto-secondary-017-39"], "chain_positions": {"cloud-chain-auto-secondary-017-39": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2021", "title": "TensorRT Fusion and Quantization Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected TensorRT-optimized batch latency after 5x kernel fusion and 4x INT8 data-movement reduction?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 0}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2022", "title": "TensorRT Dynamic Shapes Trade-offs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use one static 512-token FP16 engine or INT8 dynamic shapes with optimization profiles for the 10ms P99 SLA, and why?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 2}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2023", "title": "Debugging TensorRT Precision Fallbacks and Fusion Breaks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "training", "question": "Why is the FP16 TensorRT engine 2x slower than expected, and how would you eliminate the formatting nodes?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 1}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2024", "title": "Evaluating Test-Time Compute vs Model Scaling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 500-token reasoning outputs under 10s P95, should you upgrade to a 400B model or run Best-of-16 on the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2025", "title": "Diagnosing CoT Test-Time Scaling Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 8,000-token CoT scratchpads cause low SM utilization and OOMs, and how would you control KV-cache pressure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2027", "title": "Batched Test-Time Scaling Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the end-to-end latency for the Best-of-16 request with prefix caching, and how does it compare to Best-of-1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2028", "title": "Diagnosing and Mitigating Equal Opportunity Violations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you equalize the minority group’s TPR to 92% without retraining, and what happens to the review queue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2029", "title": "Post-Hoc Threshold Adjustment for Equal Opportunity", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would post-hoc threshold adjustment equalize TPRs for Groups X and Y, and what trade-offs does it create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2030", "title": "The Embedding Index Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory does just the raw vector data require, before any graph overhead?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 
0}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2031", "title": "The ANN Recall-Latency Tradeoff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does improving recall have such a steep latency cost, and what is the fundamental tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2032", "title": "The RAG Latency Composition", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total end-to-end time to first token, and what is the total request time?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 1}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2033", "title": "The Hybrid Search Score Fusion Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you combine these heterogeneous score distributions for meaningful ranking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2034", "title": "The Embedding Dimension Cost Curve", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What infrastructure cost difference does the 1536-dim embedding model create for a 100M-document index versus 384-dim?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2035", "title": "The RAG Context Window Overflow", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the primary cause of the degraded answer quality despite being within the context window limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2036", "title": "The Vector Index Staleness Window", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the systems root cause, and what architectural pattern solves it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2037", "title": "The Cascading Retry Storm", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "Why did the system go down for 20 minutes when the LLM only spiked for 3 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2038", "title": "The Agent Tool-Call Latency Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many sequential tool calls can you afford, and how do you design the system to stay within budget?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 0}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2039", "title": "The Embedding Cache Hit Rate Cliff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did scaling users destroy your cache economics, and how do you fix the hit rate collapse?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 0}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2040", "title": "The Reranker Bottleneck Inversion", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the extra 210ms coming from?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 2}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2041", "title": "The Multi-Model GPU Packing Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you fit all 4 models on one GPU, and what happens if you try?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 2}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2042", "title": "The Product Quantization Memory Tradeoff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much memory does the PQ-compressed index require, and what is the recall cost?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 1}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2043", "title": "The DAG Critical Path Optimization", "topic": "compound-ai-systems", 
"competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which investment has higher ROI given the current performance against the 2.5s SLA?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 3}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2044", "title": "The RAG Cache Invalidation Dilemma", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does this update propagate through the caching system, and what is the correct invalidation strategy?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 2}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2045", "title": "The Vector DB Sharding Strategy", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you shard by document ID range (range-based) or by random hash (hash-based)?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 2}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2046", "title": "The Compound System Evaluation Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you isolate which component caused the regression, and why is this fundamentally harder than debugging a single model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2047", "title": "The Agent Loop Cost Explosion", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the daily inference cost, and how does it compare to a non-agentic RAG system?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 1}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2048", "title": "The Tool-Use Timeout Cascade", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the probability that this 2-tool sequence completes within the timeout, and how should you set per-tool timeouts?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 2}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2049", "title": "The Semantic Cache Collision Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you fix semantic caching without losing its cost benefits?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 1}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2050", "title": "The Retrieval Scaling Wall", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does 100x more data cause both latency and quality to degrade, and what architectural changes are needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2051", "title": "The Embedding Model Drift Crisis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you manage the transition to the new embedding model without degrading retrieval quality or causing downtime?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 3}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2052", "title": "The Compound System Tail Latency Amplification", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the junior engineer's P99 analysis correct, what is the actual end-to-end P99, and what can you do about it?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 4}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2053", "title": "The Multi-Agent Consistency Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the 3-agent financial analysis system to show progressive results while preserving factual consistency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2054", "title": "The Model Cascade Routing Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What routing strategy achieves 95% effective accuracy while minimizing cost?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2055", "title": "The RAG vs Fine-Tuning Breakeven Analysis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach has lower TCO, and at what query volume does the breakeven point shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2056", "title": "ML vs Software Development Lifecycle", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the fundamental difference between traditional software development and ML development that makes the standard software lifecycle insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2057", "title": "The Experiment Tracking Storage Budget", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much storage does your experiment tracking system consume per year, and at what point does this become an infrastructure concern?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 0}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2058", "title": "The Iteration Speed Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much does this improve the overall iteration cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2059", "title": "The Feedback Loop Latency Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is this feedback loop asymmetry the single biggest productivity bottleneck in ML engineering?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 0}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2060", "title": "The Wasted Training Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the monthly dollar cost of failed experiments?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2061", "title": "Shift-Left Validation for Serving Constraints", "topic": "data-pipeline-engineering", 
"competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What pre-training validation would prevent these failed mobile runs, and how much would it save at 10 models per month?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2062", "title": "The Hyperparameter Search Efficiency Gap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is wrong with the grid search approach, and how do you reduce the budget while maintaining similar coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2063", "title": "The Offline-Online Metric Gap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the three most likely causes of this offline-online metric gap?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 1}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2064", "title": "The Data Cascade Failure", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why are data cascades particularly dangerous in ML workflows, and what structural practice would have caught this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2065", "title": "The Experiment Metadata Tax", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What went wrong, and what minimum metadata must be tracked per experiment to prevent this?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 1}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2066", "title": "Early Stopping vs Checkpoint Recovery", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After 12 hours of rising validation loss, should you stop at the hour-60 checkpoint or continue with a lower learning rate, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2068", "title": "The Silent Training Regression", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 40% 
support-ticket spike after a weekly retrain when aggregate validation accuracy looked normal?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 2}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2069", "title": "The Multi-Objective Experiment Frontier", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Of those 12 feasible experiments, how do you systematically choose the best model?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 3}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2070", "title": "The Pipeline Debt Diagnosis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the three most likely sources of the 2-to-8-week production slowdown as the team grew to 15 engineers, and how would you fix them?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 2}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2072", "title": "The Experiment Leakage Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Before celebrating, what should you check first, and why does the training curve shape suggest a specific failure mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2073", "title": "The GPU Cluster Utilization Mystery", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can GPUs be 65% idle while engineers wait 12 hours, and what scheduling changes would fix the cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2074", "title": "The Canary Evaluation Framework", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What metrics must an automated evaluation gate track, what thresholds should trigger rollback, and how do you handle the tension between deployment velocity and safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2075", "title": "The Training Pipeline Observability Gap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "What instrumentation should have been in place, and how would you make the kill-vs-continue decision with only the training loss curve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2076", "title": "The End-to-End Iteration Tax Audit", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where do you invest engineering effort, and how do you achieve a 3x speedup when every phase seems equally important?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2077", "title": "The ML Platform Architecture Decision", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design the platform architecture, and what is the hardest technical challenge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2078", "title": "The Experiment-to-Production Gap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why do the reported compute, latency, and accuracy gaps exist, and how does the ML development lifecycle fundamentally produce them?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 3}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2079", "title": "The LLM Evaluation Pipeline Design", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design an evaluation pipeline that provides high-confidence quality signals within a 2-day deployment cadence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2080", "title": "The Retraining Trigger Strategy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate fixed-schedule, performance-triggered, and drift-triggered retraining for a high-stakes medical diagnosis model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2081", "title": "The Training Failure Recovery Architecture", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a resilient training workflow to minimize wasted compute from GPU faults, OOMs, NaNs, and data stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2082", "title": "Attention's Quadratic Memory Wall", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If you double the sequence length from 2K to 4K tokens, by how much does the attention score matrix memory grow?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2x — attention memory scales linearly with sequence length. (Trap: linear scaling of KV cache applied to attention matrix)", "4x — the attention score matrix is S x S, so doubling S quadruples memory.", "8x — attention has cubic scaling due to the three projection matrices Q, K, V. (Trap: confusing N^3 matrix multiplication complexity with N^2 memory complexity)", "It stays the same — attention memory depends only on model dimension, not sequence length. (Trap: confusing model parameters with activation memory)"], "correct_index": 1}}, {"id": "cloud-2083", "title": "CNN vs Transformer Arithmetic Intensity", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which layer typically has higher arithmetic intensity (FLOPs per byte), and what does that imply about their hardware bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Attention has higher arithmetic intensity because it involves more matrix multiplies.", "Convolutions have higher arithmetic intensity due to massive weight reuse across spatial positions.", "They have identical arithmetic intensity since both perform dot products.", "Neither is meaningful to compare because arithmetic intensity only applies to fully-connected layers."], "correct_index": 1}}, {"id": "cloud-2084", "title": "Embedding Table Bandwidth Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is this workload almost always memory-bandwidth-bound rather than compute-bound?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 0}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Because the FP32 multiply-accumulate operations for 128-dim vectors are expensive on GPU cores.", "Because the model's MLP layers after the embedding are compute-heavy.", "Because each lookup is a random memory read with near-zero arithmetic — making it a pure bandwidth-bound workload.", "Because the embedding table doesn't fit in L2 cache, forcing reads from system DRAM."], "correct_index": 2}}, {"id": "cloud-2085", "title": "RNN Sequential Dependency vs GPU Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why did transformers 
largely replace RNNs in production NLP systems, from a systems perspective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["RNNs require more total FLOPs than transformers for the same sequence length.", "RNN hidden states create sequential dependencies that prevent parallel execution across timesteps, severely underutilizing GPU hardware.", "Transformers use less memory than RNNs because they don't store hidden states.", "GPU hardware is physically incapable of running recurrent operations."], "correct_index": 1}}, {"id": "cloud-2086", "title": "MoE AllToAll Communication Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the AllToAll communication volume per MoE layer for 4096 top-2-routed BF16 tokens across 8 GPUs?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 0}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 MB — each GPU only sends tokens it can't process locally.", "~64 MB — tokens dispatched once to the selected experts.", "~128 MB — tokens dispatched to top-2 experts and results returned.", "~512 MB — all tokens are broadcast to all 8 GPUs."], "correct_index": 2}}, {"id": "cloud-2087", "title": "KV Cache Memory Per Token", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the BF16 KV cache memory per token with 8 KV heads, and the total cache for a 4096-token context?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 0}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~40 KiB per token, ~160 MiB for 4K context.", "~320 KiB per token, ~1.25 GiB for 4K context.", "~2.5 MiB per token, ~10 GiB for 4K context.", "~5 MiB per token, ~20 GiB for 4K context."], "correct_index": 1}}, {"id": "cloud-2088", "title": "im2col Memory Expansion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does im2col transform convolution into matrix multiplication, and what is the memory overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["im2col compresses the convolution into a 1D vector, saving memory.", "im2col creates a Toeplitz matrix that has the same size as the original input.", "im2col unrolls overlapping receptive fields into columns, duplicating data up to 9x for 3x3 filters.", "im2col only affects compute time, not memory — it reindexes data in-place."], "correct_index": 2}}, {"id": "cloud-2089", "title": "Flash Attention Tiling Strategy", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"both", "question": "What is the core systems insight behind Flash Attention's speedup, and why does SRAM tiling change the bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2090", "title": "MoE Expert Load Imbalance Stall", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely root cause of this synchronization stall, and what is the throughput impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2091", "title": "Prefill vs Decode Compute Profile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is LLM prefill compute-bound while decode is memory-bandwidth-bound when viewed through arithmetic intensity?", "chain_ids": ["cloud-chain-auto-006-02"], "chain_positions": {"cloud-chain-auto-006-02": 0}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2092", "title": "GQA vs MQA KV Cache Tradeoff", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do MHA, GQA, and MQA compare in KV cache size and serving throughput implications for a 30B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2093", "title": "SSM vs Transformer Hardware Tradeoff", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When does each architecture have a systems advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2094", "title": "Transformer Layer FLOP Decomposition", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the attention and FFN FLOPs break down for this layer, and which component dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2095", "title": "DLRM Embedding Table Sharding Strategy", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you shard the 400 GB embeddings across the 8 GPUs, and why not shard every table evenly across all GPUs?", "chain_ids": ["cloud-chain-auto-006-05"], 
"chain_positions": {"cloud-chain-auto-006-05": 3}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2096", "title": "MoE Sparse vs Dense FLOP Equivalence", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do Mixtral 8x7B per-token FLOPs compare to dense 47B and 13B models, and why does MoE get more capacity for fewer FLOPs?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 2}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2097", "title": "Attention GPU Utilization Drop at Long Sequences", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does prefill GPU utilization drop from 55% on 256-token prompts to 25% on 16K-token prompts despite more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2098", "title": "MoE Capacity Factor Tuning", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you reason about the optimal CF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2099", "title": "Transformer Architecture Enables Tensor Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do multi-head attention and FFN layers naturally decompose across GPUs in Tensor Parallelism, and where are the synchronization points?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2100", "title": "Sparse Attention Patterns Meet Hardware Reality", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the theoretical FLOP reduction not translate to proportional wall-clock speedup?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2101", "title": "Embedding Table Hot-Cold Partitioning", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a tiered caching architecture for the 200 GB embedding table using the 80/20 Zipfian access pattern?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2102", "title": "MoE Expert Parallelism vs Tensor Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When should you use expert parallelism versus tensor parallelism for 128-expert MoE inference, and what are the tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2103", "title": "Architecture Choice Drives Serving Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture minimizes $/query between a dense 70B transformer vs a 47B-parameter MoE (8x7B, top-2) vs a 70B-equivalent SSM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2105", "title": "DLRM Training Pipeline End-to-End Design", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the three critical systems bottlenecks for social-scale DLRM training with hourly updates, and how should the architecture address them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2106", "title": "Multi-Architecture Serving Fleet Design", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you allocate 256 GPUs and choose parallelism strategies to serve the 175B dense transformer, 500B MoE, and 2 TB DLRM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2107", "title": "KV Cache Quantization Quality-Throughput Frontier", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What KV-cache quantization system would you deploy to get 4× throughput without adding GPUs while keeping quality loss under 1%?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 3}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2108", "title": "MoE Inference Memory Efficiency Problem", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you improve HBM memory utilization for the 128-expert 800B MoE without degrading inference latency?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 4}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2110", "title": "Arithmetic Intensity of a Linear Layer", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of Y=XW for batch size B, and when is it compute-bound versus memory-bound on an A100?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2112", "title": "Gradient Checkpointing: The Memory-Compute Trade", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With gradient checkpointing every 10 layers, what activation memory and training-time compute overhead should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2113", "title": "Forward vs Reverse Mode Autodiff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would forward-mode AD be catastrophically expensive for a 1B-parameter scalar-loss transformer, and when is it preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2114", "title": "The Dying ReLU Diagnosis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the root cause of the 87% zero activations, and what are two systems-aware mitigations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2115", "title": "The LayerNorm Bandwidth Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does LayerNorm take 8% of wall time despite only 0.1% of FLOPs, and why is kernel fusion critical?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2116", "title": "Computational Graph Memory Lifetime Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do transformer activation lifetimes differ from a strict LIFO stack, and why does that matter for GPU memory fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2117", "title": "Vanishing Gradients and Depth Limits", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", 
"zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What causes the layer-1 gradient to be 1e-15 versus 0.1 at layer 50, and what systems consequence does it have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2119", "title": "The Hidden Cost of Dynamic Graphs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the actual bottleneck causing the GPU launch gaps, and what two concrete fixes would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2121", "title": "Designing Mixed-Precision Backpropagation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should tensors flow through the 70B LLM mixed-precision forward-backward-update cycle, including FP32 loss, loss scaling, and master weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2122", "title": "Exploiting Activation Sparsity in FFN Layers", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will 90% ReLU activation sparsity yield a near-10x FFN speedup by skipping zero rows, and what hardware obstacles limit it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2123", "title": "BatchNorm Breaks in Distributed Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 accuracy drop when local batch size falls from 256 to 4 across 64 GPUs, and what are two fixes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2124", "title": "Designing an Autograd Engine From Scratch", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the autograd engine's data structures and backward algorithm, and why is topological sort essential?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2125", "title": "Designing Around Non-Differentiable Operations", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you train through discrete decisions like hard MoE routing end-to-end, and what systems cost does your approach add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2126", "title": "Optimal Recompute-vs-Save Scheduling", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate activation checkpoint selection with a 33% recompute budget, and what practical algorithm would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2127", "title": "The Feature Store Latency Budget", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can you serve this model within the SLA using sequential feature lookups, and why?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 0}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2128", "title": "The Embedding Table Memory Wall", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory do the embedding tables require, and can they fit on a single A100 (80GB HBM)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2129", "title": "The Training-Serving Skew Trap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the most likely cause of the conversion rate drop?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 0}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2130", "title": "The Batch vs Real-Time Feature Tradeoff", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is this architecture fundamentally inadequate for this feature, and what is the minimum viable real-time path?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 0}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2131", "title": "The Feature Backfill Cost", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the compute cost of backfilling user_lifetime_spend_percentile for 100M users over 180 days of 1B transactions/day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2132", "title": "The Embedding Lookup Bottleneck", "topic": "data-pipeline-engineering", 
"competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "A junior engineer suggests upgrading to a GPU with 2x more TFLOPS. Will this help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2133", "title": "The Feature Interaction Explosion", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many new features does crossing 500 categorical features create, and what is the systems impact?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2134", "title": "The Point-in-Time Join Correctness", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What went wrong with the feature join, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2135", "title": "The Feature Freshness vs Cost Tradeoff", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tier the 300 features to capture 90% of freshness value at about 20% of the $2M/year real-time cost?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 2}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2136", "title": "The Embedding Sharding Strategy", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does row-parallel sharding cause massive communication bottlenecks here, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2137", "title": "The Streaming Feature Consistency Trap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you ensure exactly-once semantics for streaming features to prevent double-counting during rebalances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2138", "title": "The Feature Store Hot Key Problem", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you fix this single-shard saturation without re-architecting the entire feature store?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 2}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2139", "title": "The Feature Version Mismatch", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design feature versioning so models trained on min-max values cannot silently receive z-score values?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 2}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2140", "title": "The GPU Preprocessing Offload", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you move the 500-feature preprocessing pipeline to the GPU, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2141", "title": "The Unified Feature Store Architecture", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a feature store that guarantees training-serving consistency for 30 models by construction rather than by testing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2142", "title": "The Silent Feature Pipeline Failure", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you debug the 3-week conversion decline when systems dashboards are green but a feature pipeline issue is suspected?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 2}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2143", "title": "The Feature Serving Latency Decomposition", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you break down the 95ms feature-serving overhead and optimize P99 below the 100ms SLA?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 3}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 95ms is dominated by network latency; upgrade from 10 Gbps to 25 Gbps NICs and add edge caching to reduce RTT from 95ms to under 75ms", "Replace the feature store with an in-GPU embedding cache to eliminate all feature retrieval latency, reducing the pipeline to model inference time only (~25ms)", "Increase the SLA to 150ms since 120ms P99 is already competitive; feature retrieval latency is 
inherently sequential and cannot be parallelized across different feature sources", "Decompose the 95ms into feature store lookup (~25ms P99), real-time feature computation (~45ms P99), serialization (~5ms), and network RTT (~20ms); parallelize batch and real-time feature fetches to reduce total to ~95ms, meeting the 100ms SLA"], "correct_index": 3}}, {"id": "cloud-2144", "title": "The Multi-Model Feature Platform", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a centralized feature platform for 50 models that shares computation while preserving team independence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2145", "title": "The Embedding Serving at 10M QPS", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a 5ms-P99 embedding lookup service for 10M QPS over 2.56TB of embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a 3-tier architecture: in-process hot cache (top 0.1% rows, ~2.56 GB, <0.1ms), distributed warm cache (top 5% rows, ~128 GB Redis, ~1ms), and SSD-backed cold store (full 2.56 TB, ~3ms); the 80/19/1 hit-rate split achieves P99 < 5ms across 200 serving nodes", "Serve all 2.56 TB from GPU HBM across 32 A100s (80 GB each) for sub-millisecond latency; GPU memory bandwidth of 2 TB/s can handle the 25.6 GB/s aggregate read load easily", "Shard the 2.56 TB across 256 Redis nodes (10 GB each) with consistent hashing; Redis provides ~0.5ms P99 per lookup, meeting the 5ms SLA without any tiered caching", "Use a single large NVMe SSD array with 2.56 TB capacity and SPDK for kernel-bypass reads at ~10 microseconds per lookup, far exceeding the 5ms SLA requirement"], "correct_index": 0}}, {"id": "cloud-2146", "title": "The End-to-End Feature Pipeline Redesign", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "With 6 months and 4 engineers, how would you architect, migrate, and validate a system that drives training-serving skew toward zero?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 3}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2148", "title": "The Gradient Accumulation Trick", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Without adding more GPUs, how do you achieve this effective batch size, and what is the computational cost?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 0}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2150", 
"title": "The Loss Spike at Step 50K", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the three most likely root causes of the loss spike, ordered from most to least common?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2151", "title": "The Warmup Necessity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is warmup critical for large models with Adam, and how do you calculate a reasonable warmup duration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2152", "title": "The FP16 Loss Scaling Dance", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What happened, and what mechanism should have prevented this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2153", "title": "The Activation Checkpointing Tradeoff", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the memory-compute tradeoff, and how do you decide which layers to checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2154", "title": "The Batch Size Scaling Wall", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did scaling GPT training to 512 GPUs and 8M-token batches hurt final quality despite faster steps and linear LR scaling?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 2}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2155", "title": "The Training Memory Budget Decomposition", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Walk through the complete memory budget decomposition and determine the maximum model size (in parameters) you can train.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2156", "title": "The Gradient Norm Anomaly", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is this a healthy training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2158", "title": "The Pre-Training Cost Estimate", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is a realistic cost estimate to pre-train a 70B model on 2T tokens at $3/GPU-hour, including failure overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2159", "title": "The Slow Training Step Diagnosis", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you profile a 70B run on 256 GPUs to find why steps take 12s instead of the expected 4s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2160", "title": "The Chinchilla Scaling Decision", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option should you choose: 70B on 500B tokens, 25B on 500B tokens, or repeating the 500B tokens about 3 times, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Train a 25B model on 500B tokens to be Chinchilla-optimal; this maximizes training loss per FLOP and produces the best model possible within the data constraint", "Train the 70B model on 500B tokens (Chinchilla-undertrained) because inference cost dominates TCO — the 70B model at 500B tokens likely outperforms a 25B Chinchilla-optimal model on downstream tasks, and with INT4 quantization the serving cost premium is only ~1.5x", "Train the 70B model and repeat the 500B tokens 3x (1.5T tokens) to match the Chinchilla-optimal token count; data repetition has no diminishing returns for high-quality data", "Split the budget: train a 70B model on 500B tokens, then distill it into a 7B model for serving; this gives both quality and inference efficiency without wasting compute"], "correct_index": 1}}, {"id": "cloud-2161", "title": "The Gradient Accumulation vs. 
Data Parallelism Choice", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the systems-level tradeoffs between large micro-batches (A) and smaller micro-batches with gradient accumulation (B)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2162", "title": "The 10K-GPU Training Instability", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you systematically diagnose and recover from this reproducible loss divergence around step 80K?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2163", "title": "The Training Recipe Design", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete training configuration would you choose for the 40B model on 1,024 GPUs, and how would you justify it quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use speculative decoding: a small 7B draft model generates 5-8 candidate tokens autoregressively, then the 70B target model verifies all candidates in a single forward pass; this converts sequential decode steps into parallel verification, achieving 2-3x speedup without quality loss since rejected tokens are resampled from the target distribution", "Increase the batch size to amortize the memory-bandwidth cost of loading 140 GB of weights per token; with batch size 256 the arithmetic intensity exceeds the roofline knee point", "Quantize the model to INT4 (35 GB) to double the effective memory bandwidth from 3.35 TB/s to 6.7 TB/s equivalent, directly doubling the token generation rate", "Use continuous batching to interleave prefill and decode phases across requests; this improves throughput but does not reduce per-request latency for the single-user scenario described"], "correct_index": 0}}, {"id": "cloud-2164", "title": "The Continual Pre-Training Dilemma", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you continually pre-train the 70B model on 100B medical tokens without catastrophic forgetting or general benchmark regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2165", "title": "The Warp Size Thread Count", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many threads actually execute per scheduling unit on an NVIDIA GPU, and why does launching 100 threads per block waste resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2166", "title": "The Memory Coalescing Penalty", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does Kernel A run significantly faster than Kernel B on the H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Kernel A is compute-bound, while Kernel B is memory-bound due to different arithmetic intensities.", "Kernel B causes warp divergence, forcing threads to execute sequentially instead of in parallel.", "Kernel A enables memory coalescing where a warp fetches a single 128-byte cache line, whereas Kernel B's strided access wastes up to 97% of fetched cache line bandwidth.", "Kernel B writes to read-only memory, causing L2 cache invalidations on every memory transaction."], "correct_index": 2}}, {"id": "cloud-2171", "title": "The Flash Attention Tiling Insight", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Flash Attention compute exact softmax without materializing the full attention matrix, and what memory hierarchy level does it exploit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2172", "title": "The Kernel Launch Overhead Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you eliminate kernel launch overhead, and what are the tradeoffs of each approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2173", "title": "The Occupancy vs IPC Tradeoff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the lower-occupancy 64x64-tile GEMM run 2x faster, and when should you intentionally sacrifice occupancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2174", "title": "The GPU Memory Hierarchy Bandwidth Stack", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is skipping shared memory and using L2 directly sound for this memory-bound H100 kernel, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2175", "title": "The Dataflow Stationary Strategy", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which dataflow minimizes total data movement for this specific GEMV shape, and how does the answer change when M increases to 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-2176", "title": "The CUDA Streams Overlap Failure", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did the three CUDA streams serialize, and what synchronization design enables proper overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2178", "title": "The Custom Fused Kernel Design", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tiling strategy, work partitioning, HBM traffic reduction, and primary risk would you choose for this fused Triton kernel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2179", "title": "The Thread Block Scheduling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does increasing the H100 grid from 264 to 265 blocks raise kernel time from 1.0 ms to 1.5 ms, and how should grids be sized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2180", "title": "The Ring AllReduce Bandwidth Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much total data does each GPU send and receive during one AllReduce operation?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 0}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2181", "title": "The Collective Primitive Confusion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the semantic differences between AllGather, ReduceScatter, and AllReduce, and why does FSDP use them at different points?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 0}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2182", "title": "The Alpha-Beta Communication Model", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the alpha-beta model, what is the Ring AllReduce time for 200 MB across 32 GPUs, and when does latency become negligible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2183", "title": "The Ring vs Tree Algorithm Selection", "topic": "collective-communication", "competency_area": 
"networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does NCCL use Tree AllReduce below 256 KB and Ring AllReduce for larger buffers on the 64-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2184", "title": "The Gradient Bucket Fusion Tradeoff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Who is right, and what is the real tradeoff being managed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2185", "title": "The Overlap Efficiency Ceiling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Where did the remaining 10 ms of execution time come from if communication is fully overlapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 10 ms is overhead from SM contention, memory bandwidth contention, and CUDA synchronization introduced by overlapping communication and computation.", "The AllReduce operation requires exactly 10 ms of pure CPU processing before it can be offloaded to the NCCL backend.", "The overlap completely failed to execute, so the system fell back to sequential execution, and the 10 ms is the time taken to check for the failure.", "Communication-computation overlap requires an additional data copy to system RAM, which takes exactly 10 ms."], "correct_index": 0}}, {"id": "cloud-2186", "title": "The Hierarchical AllReduce Asymmetry", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is flat Ring AllReduce slow for 2 GB gradients across 256 GPUs, and what topology-aware algorithm would you use instead?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 3}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2188", "title": "The Gradient Compression Convergence Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive Top-K gradient sparsification hurt convergence, and what mechanism is needed to make it work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2189", "title": "The NCCL Collective Deadlock", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did this cause a deadlock, and what invariant did the model code violate?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2190", "title": "The Bandwidth-Optimal Lower Bound", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the AllReduce bandwidth lower bound, does Ring achieve it, and can any large-message algorithm beat Ring on a fully connected topology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2192", "title": "The In-Network Reduction Promise", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the networking team's claim that SHARP will cut AllReduce time in half valid, and when does SHARP help most or least?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2194", "title": "The 100K-GPU Collective Breakdown", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What collective communication architecture would you design for 100,000 GPUs to handle DP AllReduce, FSDP sharding, and MoE AllToAll?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2195", "title": "The Roofline Diagnostic", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is this kernel compute-bound or memory-bound, and what is its theoretical attainable performance according to the roofline model?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 0}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2196", "title": "The Micro-Benchmark Mirage", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why did a 2.5x attention kernel micro-benchmark speedup yield only an 8% end-to-end Llama-70B throughput gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2197", "title": "The Statistically Invalid Benchmark", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you determine whether this improvement is statistically significant, and what is the minimum number of runs needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2198", "title": "The Benchmark Gaming Red Flags", "topic": "pruning-sparsity", "competency_area": 
"optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What benchmark gaming techniques should you suspect, and how does MLPerf's compliance framework attempt to prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The only red flag is the 128K batch size, which exceeds the memory capacity of the A100. MLPerf prevents this by requiring all submissions to use a strict 4K batch size regardless of cluster scale.", "Three gaming red flags: accuracy cliff-hanging, unrealistic 128K batch size, and hyperparameter overfitting with a custom LR schedule. MLPerf's compliance framework attempts to prevent this by enforcing reference architectures, optimizers, and restricting hyperparameter tuning in the Closed division.", "The main red flag is training to exactly 75.9% accuracy. MLPerf rules require training to at least 80% accuracy for ResNet-50 to ensure the model has converged properly before stopping the timer.", "There are no red flags; 128K batch sizes with custom LR schedules are standard practice for large clusters. MLPerf encourages these optimizations to push the boundaries of hardware utilization."], "correct_index": 1}}, {"id": "cloud-2199", "title": "The Right Profiler for the Job", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "In what order do you use these tools, and what does each one tell you that the others cannot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2200", "title": "The Power Measurement Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What went wrong with the power measurement methodology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The measurement estimated 350 kW, but thermal mass smoothing reduces the actual required cooling to 200 kW.", "The measurement accurately captured 350 kW of GPU power, requiring 350 kW of cooling.", "The measurement missed sustained prefill bursts and system overheads, severely underestimating the true peak load of 735 kW.", "The measurement incorrectly included PUE, meaning the actual required capacity is only 250 kW."], "correct_index": 2}}, {"id": "cloud-2201", "title": "The Nsys Timeline Mystery", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the three most likely causes of this gap, and how do you distinguish between them using the nsys timeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2202", "title": "The MFU vs HFU Confusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the distinction between MFU and HFU matter, and 
which metric should you trust for comparing training configurations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["MFU and HFU are identical metrics; 62% is valid regardless.", "62% is likely HFU; true MFU ≈ 47% (accounting for ~33% recompute). MFU normalizes out implementation choices.", "62% is likely MFU; true HFU ≈ 82% (adding 33% recompute to 62%).", "62% is likely nvidia-smi utilization; true MFU is >80%."], "correct_index": 1}}, {"id": "cloud-2203", "title": "The Conservation of Bottlenecks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does Conservation of Complexity explain the FFN fusion's 12% step-time reduction and AllReduce becoming the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2204", "title": "The Full-Stack Performance Mystery", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you debug a 64-GPU job with healthy GEMMs, NCCL, and data pipeline but only 35% MFU instead of the expected 55%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Generate an nsys timeline and look for serialization, micro-bubbles between kernel launches, and straggler GPUs causing idle time before AllReduce barriers.", "Switch from a Ring AllReduce to a Tree AllReduce topology to immediately resolve the 20% MFU gap.", "Focus exclusively on the data loader; if it is not stalling, the only other possibility is that the CPU is overheating.", "Increase the batch size by 4x to force the GPU to become compute-bound, ignoring the network communication overhead."], "correct_index": 0}}, {"id": "cloud-2205", "title": "The MLPerf Division Dilemma", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which MLPerf Inference division should the VP prioritize, Open or Closed, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Open division, because 3x faster INT4 quantization proves the hardware's superior raw compute capability.", "Open division, because hyperscalers only care about the absolute lowest latency (3x faster).", "Closed division, because it constrains models and allows direct hardware comparisons for procurement.", "Closed division, because custom model architectures highlight specific tensor core advantages."], "correct_index": 2}}, {"id": "cloud-2206", "title": "The Energy Roofline", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which config has higher tokens per joule, and how would an energy roofline show whether the workload is compute-, bandwidth-, or power-limited?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": 
{"cloud-chain-auto-005-09": 3}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2207", "title": "Designing a Benchmark Suite for LLM Inference", "topic": "compound-ai-systems", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What limitations does the current benchmark have, and how would you design a suite that tests real-world production dimensions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2208", "title": "The Silent Performance Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an observability and benchmarking system to detect, attribute, and prevent a gradual 15% LLM P50 latency regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2209", "title": "The Dominant Resource Fairness Intuition", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does this naive split waste cluster resources, and what is the core idea behind Dominant Resource Fairness (DRF)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Naive scheduling equally divides all resources, leaving 40 GPUs and 380 CPUs stranded because each team hits their other resource limit first. DRF equalizes the allocated fraction of each team's bottleneck resource (CPU for A, GPU for B), allocating 16 jobs to A and 20 to B for ~100% cluster utilization.", "Naive scheduling over-allocates GPUs to Team A. DRF ensures that both teams receive an equal number of GPUs, regardless of their CPU requests, ensuring strict GPU fairness.", "Naive scheduling leads to CPU starvation for Team B. DRF resolves this by statically splitting the cluster into CPU-heavy and GPU-heavy partitions, achieving 50% utilization.", "Naive scheduling works well but requires manual tuning. 
DRF dynamically scales the CPU frequency based on GPU utilization, eliminating stranded resources."], "correct_index": 0}}, {"id": "cloud-2210", "title": "The MIG Partitioning Tradeoff", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the fundamental tradeoff MIG introduces, and why can you not simply get 7x the jobs from each GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2211", "title": "The Backfill Scheduling Gap", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does backfill scheduling work here, and how much of the 256 idle GPU-hours does it recover?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 0}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2212", "title": "The Rail-Optimized Placement Problem", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did the placement cause the AllReduce to be 3x slower, and what scheduling constraint would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2213", "title": "The Heterogeneous Fleet Capacity Plan", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you plan capacity for the new serving tier without buying new hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2214", "title": "The Deadline-Aware Scheduling Inversion", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling principle did SJF violate when handling the 50 evaluation jobs, and how should the scheduler be redesigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["SJF optimizes for maximum cluster utilization, which inherently delays short jobs. The scheduler should use First-In-First-Out (FIFO) to guarantee fairness.", "SJF optimizes for average completion time but ignores job deadlines and slack. The scheduler should be redesigned to use Earliest Deadline First (EDF) or a slack-aware policy so urgent jobs are prioritized.", "SJF failed because it was unaware of the GPUs' memory capacity, scheduling too many evaluation jobs on the same node. The scheduler should be redesigned to be memory-aware.", "SJF correctly optimized the workload, and the deadline miss is an unavoidable consequence of bursty traffic. 
No scheduler redesign is needed; the cluster just needs more GPUs."], "correct_index": 1}}, {"id": "cloud-2216", "title": "The NUMA-Unaware Inference Regression", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the Kubernetes scheduler missing that explains the 40% latency increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2217", "title": "The Multi-Tenant Noisy Neighbor", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What shared resource is the training job contending for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2218", "title": "The Opportunistic Training Checkpoint Race", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you redesign the preemption protocol to guarantee the 60-second SLA while minimizing lost training work?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 1}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2219", "title": "The Power Density Ceiling", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does the scheduler need to account for power as a schedulable resource, and what is the maximum number of GPUs you can realistically schedule at peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2220", "title": "The Heterogeneous Workload Placement Matrix", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which workload goes on which GPU type to maximize fleet-wide cost-efficiency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2221", "title": "The GPU Fragmentation Death Spiral", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is causing the GPU fragmentation death spiral, and what multi-level fixes would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2222", "title": "The Multi-Tenant Starvation Cascade", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", 
"bloom_level": "create", "status": "published", "phase": "inference", "question": "What scheduling policy resolves Team A's borrowed GPUs, Team B's queued 256-GPU job, and Team C's Black Friday burst without new hardware?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 3}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2223", "title": "The Liquid-Cooled Scheduling Frontier", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you schedule the job to maximize speed, and can both GPU types be used synchronously?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2224", "title": "The Storage Tier Latency Gap", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many orders of magnitude separate HBM access latency from Lustre, and why does that gap matter for ML training?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 0}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2225", "title": "The Lustre Stripe Throughput", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What aggregate read throughput can the 50-OST-striped file achieve, and what happens if it is stored on only 1 OST?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2226", "title": "The Checkpoint Bandwidth Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a synchronous checkpoint take, and what fraction of training time is lost if you checkpoint every 30 minutes?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 0}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2227", "title": "The NVMe Warm Cache Miss", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the training throughput drop by 40% during the first epoch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2228", "title": "The Object Store Training Anti-Pattern", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", 
"question": "What is the root cause of the 500 samples/sec bottleneck, and how should they restructure the data?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 1}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2229", "title": "The Write Amplification Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How is a 2 TB checkpoint causing more than 2 TB of physical writes to the SSD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash SSDs cannot overwrite data in place; they must erase entire blocks. This forces the flash translation layer to read, erase, and rewrite valid pages along with new data, multiplying the physical writes (write amplification).", "The operating system caches the checkpoint in RAM and continuously flushes it to the SSD, resulting in duplicate physical writes for the same file due to fsync operations.", "PyTorch and other frameworks write checkpoints in small 4KB chunks, which misaligns with the NVMe physical page size and forces the drive to store duplicate metadata.", "The parallel filesystem driver automatically replicates the 2 TB checkpoint across multiple SSDs within the local node to ensure high availability."], "correct_index": 0}}, {"id": "cloud-2230", "title": "The Metadata Storm", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this 'metadata storm' happen, and how do you fix it without upgrading the MDS hardware?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 1}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2231", "title": "The Checkpoint Tiering Strategy", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a tiered checkpoint strategy for the 100K-GPU run with 18 TB checkpoints that keeps GPU idle time under 10 seconds?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 3}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2232", "title": "The I/O Wall at Scale", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 35% drop in per-GPU throughput when scaling from 256 to 4,096 GPUs, given underutilized network and compute?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 2}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The shared Lustre filesystem is hitting an I/O wall; at 4,096 GPUs the aggregate read demand exceeds 80% of the 500 GB/s capacity, causing non-linear latency spikes that stall the data loader.", "The InfiniBand network is bottlenecked by the AllReduce communication of 4,096 GPUs, causing the training step to block on network transfers.", "The GPUs are thermal throttling due to the increased density of the 4,096 GPU cluster, reducing their compute utilization.", "The data pipeline requires more CPU workers to decode the images, as the CPU compute cannot keep up with the GPU demand."], "correct_index": 0}}, {"id": "cloud-2233", "title": "The Erasure Coding Overhead", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does erasure coding hurt write performance, and when is the tradeoff worth it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2234", "title": "The Data Gravity Problem", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What bottleneck caused the 200 TB S3-backed training job to slip from 40 to 55 hours, and what storage architecture fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2235", "title": "The Checkpoint Cascade Failure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What caused this cascade, and how do you prevent it?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 2}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2236", "title": "The I/O Jitter Amplification", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 500 ms storage hiccup on one or two workers slow all 16,000 GPUs by 450 ms, and how would you fix it?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 3}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The hiccup only affects the local GPU; the overall 450ms slowdown is an illusion. Fix by ignoring it.", "Synchronous AllReduce forces all GPUs to wait for the slowest worker; fix by using asynchronous parameter servers.", "The Lustre hiccup causes an interrupt storm across the InfiniBand fabric. Fix by disabling RoCE.", "Synchronous data parallelism forces a barrier where all GPUs wait for the straggler's delayed compute. 
Fix by increasing prefetch depth and sharding data across more OSTs."], "correct_index": 3}}, {"id": "cloud-2237", "title": "The 100K-GPU Storage Architecture", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What storage hierarchy meets the 100K-GPU cluster's 20 TB/s reads, 20 TB checkpoints, failure tolerance, and $15M budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2238", "title": "The Storage Disaggregation Dilemma", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Under what conditions is disaggregated storage the right choice, and when does converged storage win?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 4}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Chinchilla laws only apply to transformer language models; they cannot be used for cluster design decisions because different model architectures have different scaling behaviors", "Compute-optimal training means using the largest possible cluster for the shortest time; Chinchilla laws imply that time-to-completion, not cost, should be the primary optimization target", "The Chinchilla scaling laws prescribe N ∝ D (parameters proportional to tokens) for compute-optimal training. The key insight for cluster design: (1) A 10× larger model requires 10× more data and 100× more compute. (2) This means GPU-hours scale as C = 6ND ∝ N², so doubling model size quadruples training cost. (3) Network bandwidth must scale with model size (larger AllReduce), not data size. 
(4) Storage must scale linearly with data size for preprocessing throughput.", "Chinchilla scaling shows that data size is irrelevant — only parameter count determines final model quality; cluster design should therefore focus exclusively on compute FLOPS, not storage or I/O bandwidth"], "correct_index": 2}}, {"id": "cloud-2239", "title": "BSP vs Streaming Dataflow for Feature Computation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is a pure streaming approach problematic for the historical batch features, and what execution model mismatch is at play?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2240", "title": "The Shuffle Data Volume Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much data moves across the network during the shuffle phase, and what determines whether this is a broadcast join or a shuffle join?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 0}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2242", "title": "The Petabyte Deduplication Shuffle Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does this approach fail at petabyte scale, and what is the standard distributed deduplication pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2243", "title": "Actor Model vs BSP for Data Processing Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under what conditions would the actor-model approach outperform BSP for data processing, and when would it be strictly worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2244", "title": "The Missing Combiner in Distributed Aggregation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is 18 TB moving across the network to produce a 2 GB result, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2245", "title": "Bootstrapping a Quality Classifier for Trillion-Token Corpora", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bias, and how do you build a more robust quality scoring pipeline at this scale?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bias is over-sampling short documents. Chunk into 512-token segments.", "The bias is language imbalance. Translate all data to English before filtering.", "The bias favors massive SEO spam. Enforce strict maximum length thresholds.", "The bias is domain-quality conflation. Use multi-signal heuristics and stratified sampling."], "correct_index": 3}}, {"id": "cloud-2246", "title": "The Curriculum Data Mixing Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you determine the optimal mixture, and what systems constraint makes this problem harder than it appears?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2247", "title": "The Streaming Deduplication State Explosion", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the Flink checkpoint time to grow to 25 minutes, and what architecture handles unbounded deduplication state growth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2248", "title": "The Distributed Join Strategy Selection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which join strategy should you use for the 5 GB, 200 GB, and 2 TB lookup joins, and what is the total network I/O?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 2}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use Broadcast join for all three tables since Spark can spill broadcast variables to disk. Total network I/O is 2.5 TB.", "Use Sort-Merge join for all three tables to avoid memory errors. Total network I/O is 50 TB + 5 GB + 200 GB + 2 TB =~ 52.2 TB.", "Use Broadcast join for the 5 GB table, and Sort-Merge or Shuffle Hash join for the 200 GB and 2 TB tables. Total network I/O is roughly 2.5 TB (broadcast) + 50.2 TB (shuffle 2) + 52 TB (shuffle 3) = 104.7 TB.", "Use Shuffle Hash join for the 5 GB and 200 GB tables, and Sort-Merge join for the 2 TB table. 
Total network I/O is 154 TB."], "correct_index": 2}}, {"id": "cloud-2249", "title": "Designing a Multi-Signal Data Quality Pipeline at Petabyte Scale", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a 100 TB/week quality-scoring pipeline that supports cheap, rapid signal-weight ablations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2250", "title": "The Curriculum Ordering vs Random Shuffling Tradeoff", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Under what conditions does curriculum ordering beat random shuffling, and what are the systems costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2251", "title": "The Connected Components Problem in Fuzzy Deduplication", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you efficiently resolve this at the scale of 800M edges over 5B nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2252", "title": "Designing the End-to-End Pre-Training Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the full reproducible pipeline for the 2T-token corpus, justify the technology choices, and test the $50K/month budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2253", "title": "The Deterministic Global Shuffle at Trillion-Token Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you implement a deterministic, coordinator-free global shuffle for 2M shards and 2,048 GPUs, and handle elastic GPU-count changes?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 3}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2254", "title": "The SSA Form Purpose", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What property of SSA makes it essential for compiler optimization, and what would go wrong without it?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 0}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2255", "title": "The Dialect Hierarchy Lowering", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does an ML compiler need multiple IR levels instead of compiling directly from the computation graph to machine code?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 0}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2256", "title": "The Tiling Factor Search Space", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How large is the search space for a full model with 80 unique kernels, and why does this make exhaustive search impractical?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 1}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2257", "title": "The Triton vs CUDA Tradeoff", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If hand-tuned CUDA reaches 85% of H100 peak, what peak-FLOPS fraction should Triton achieve, and why is there a gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2258", "title": "The Lowering Pass Fusion Loss", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why would SwiGLU break the fusion pass, and at which IR level does the failure occur?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 1}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2259", "title": "The Graph Break Recompilation Storm", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is causing the recompilation storm, and how do you diagnose which operations are triggering graph breaks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2260", "title": "The Cost Model vs Profiling Dilemma", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the analytical cost model break on the new hardware, and how would you fix it for future GPU generations?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 2}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 has a smaller L2 cache than the A100, causing the analytical model to overestimate cache hits. The fix is to hardcode the new cache sizes.", "The H100 changed microarchitectural features (like TMA and larger L2 cache) that the analytical model ignored. The scalable fix is to use a learned cost model calibrated with profiling data.", "The A100 cost model relies on Tensor Cores, which were replaced by CUDA cores in the H100. The fix is to switch entirely to random search for all compilations.", "The auto-tuner was profiling kernels using CPU memory instead of GPU HBM. The fix is to ensure all tensors are moved to device memory before tracing."], "correct_index": 1}}, {"id": "cloud-2261", "title": "The XLA vs TorchInductor Decision", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the architectural differences between XLA and TorchInductor, and when does each have a structural advantage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2262", "title": "The Tensor Core Codegen Gap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is codegen falling back to FFMA instead of HMMA, and where in the compiler should you fix it?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 2}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2263", "title": "The Auto-Tuning Transfer Problem", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does tuning for batch_size=32, seq_len=2048 fail on dynamic shapes, and how would you cover the shape space without exhaustive tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Auto-tuning fails because it overfits to the weight distribution of the model. To cover the shape space, you should apply regularization during the tuning phase.", "B) Auto-tuning fails because the learning rate is statically compiled. To cover the shape space, dynamically schedule learning rates based on batch size.", "C) Auto-tuning fails because exhaustive search is too computationally expensive to run online. To cover the shape space, you must tune all possible combinations offline and store them.", "D) Auto-tuning fails because optimal tile sizes depend on the ratio of compute to memory traffic, which shifts with shape. 
To cover the space, partition it into buckets by arithmetic intensity, tune a representative shape per bucket, and dispatch dynamically."], "correct_index": 3}}, {"id": "cloud-2264", "title": "The Custom Op Compiler Co-Design", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can't the compiler generate efficient code for GLA, and how would you close the gap with FlashAttention-2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2265", "title": "The MLIR Retargetability Boundary", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where does MLIR retargetability break when moving to MI300X, and what extra optimization work is required?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 3}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2266", "title": "The Compilation Latency Wall in Serving", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate kernel caching, AOT compilation, and background compilation against the 5-second cold-start SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2267", "title": "The New Accelerator Compiler Stack", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the MLIR compiler stack for this message-passing PE accelerator, including dialects, lowering passes, and auto-tuning?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 4}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2270", "title": "The cuDNN Convolution Dispatch", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does cuDNN select different algorithms for different layers, and what is the primary factor driving this selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2272", "title": "The Framework Dispatch Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What specifically causes the 65% CPU-side idle time during inference, and how can you fix it?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": 
{"cloud-chain-auto-005-19": 0}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2273", "title": "The Library vs Custom Kernel Decision", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you evaluate the build-vs-wait decision for writing a custom Triton kernel versus waiting for vendor library support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2274", "title": "The Caching Allocator Fragmentation Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the allocation fail, and what is the role of PyTorch's caching allocator in this failure?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": {"cloud-chain-auto-005-19": 2}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2275", "title": "The torch.compile Graph Break Cascade", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is a graph break, why did these patterns cause 47 of them, and how do you eliminate them to recover the speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2276", "title": "The Autograd Overhead in Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where is the extra 14GB coming from, and how do you reclaim it?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": {"cloud-chain-auto-005-19": 1}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2277", "title": "The Runtime Selection for Multi-Model Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which inference runtime should you choose for each of the five models, and why can't one runtime serve all five optimally?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2278", "title": "The CUDA Graphs vs Dynamic Shapes Tradeoff", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you preserve most CUDA Graph benefits while handling variable output lengths and continuous batching?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2279", "title": "High-Fanout Item Feature Retrieval", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 250ms item feature fetch and bring end-to-end P99 under the 150ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reduce the candidate pool from 1,000 to 200 items to cut down fetch times by 80%, reducing item fetch to 50ms and total latency to 120ms.", "Implement batched retrievals (MGET) to group the 1,000 item IDs into 5 parallel batches, and add an in-memory L1 cache to achieve an 80% hit rate, dropping fetch latency to ~2ms.", "Upgrade the Redis-backed feature store to faster NVMe SSDs to reduce the 2.0ms database processing time per item down to 0.1ms.", "Increase the number of concurrent threads from 10 to 100 so each thread only makes 10 sequential calls, reducing fetch latency to 25ms."], "correct_index": 1}}, {"id": "cloud-2280", "title": "The Compression Pipeline Ordering Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does it matter whether we prune first and then quantize, or quantize first and then prune?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2282", "title": "The Activation Sparsity Mirage in MoE", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does offloading inactive MoE experts to CPU DRAM cause 5x P99 latency in continuous batching at 200 QPS?", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 0}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2283", "title": "The Structured Pruning Recovery Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you determine if more data will close the gap or if you pruned too much, and how do you systematically set the ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2285", "title": "The Full-Stack Compression Audit", "topic": "extreme-quantization", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is wrong with the current compression pipeline, and how would you redesign it from first principles?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2287", "title": "The Fairness Impossibility Tradeoff", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are demographic parity and equalized odds impossible to satisfy together except in trivial cases?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2288", "title": "The Fairness Monitoring Compute Budget", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 100M DAU and 8 demographic groups, what hourly volume, 30-day audit storage, and compute cost does fairness monitoring require?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2290", "title": "The Intersectional Subgroup Explosion", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive intersectional subgroup enumeration fail, and what computational and statistical challenges must the audit handle?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Naive enumeration is perfectly fine because 100K samples guarantee at least 800 samples per subgroup, providing sufficient statistical power across all 120 intersections.", "The 120 subgroups can be evaluated efficiently using Bonferroni correction without any loss of statistical power or need for more data.", "It fails because enumerating 120 subgroups creates severe statistical underpowering for minority intersections and guarantees false positives due to multiple comparisons without correction.", "The computational overhead of calculating equalized odds 120 times will cause the evaluation pipeline to exceed the maximum runtime of standard CI/CD runners."], "correct_index": 2}}, {"id": "cloud-2291", "title": "The EU AI Act Compliance Pipeline", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What infrastructure is needed for EU AI Act compliance, and which requirement is most expensive to retrofit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2293", "title": "The End-to-End Fairness Infrastructure", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the 
Responsible AI infrastructure, specifying key components, data flows, storage requirements, and the hardest systems challenge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2294", "title": "Real-Time Aggregates Latency Tradeoff", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do the on-the-fly and streaming designs compare for the 30-day z-score, and what architecture would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2295", "title": "Fraud Detection Feature Latency Bottleneck", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you reduce P99 latency from 180ms to below the 100ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2296", "title": "High-Fanout Item Feature Retrieval", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 250ms item feature fetch and bring end-to-end P99 under the 150ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reduce the candidate pool from 1,000 to 200 items to cut down fetch times by 80%, reducing item fetch to 50ms and total latency to 120ms.", "Implement batched retrievals (MGET) to group the 1,000 item IDs into 5 parallel batches, and add an in-memory L1 cache to achieve an 80% hit rate, dropping fetch latency to ~2ms.", "Upgrade the Redis-backed feature store to faster NVMe SSDs to reduce the 2.0ms database processing time per item down to 0.1ms.", "Increase the number of concurrent threads from 10 to 100 so each thread only makes 10 sequential calls, reducing fetch latency to 25ms."], "correct_index": 1}}, {"id": "cloud-2297", "title": "DLRM Iteration Loop Optimization", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the experiment pipeline to get iteration time under 10 hours without exceeding the current 720 A100-hours per week budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2298", "title": "RAG Pipeline Evaluation Architecture", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the RAG experiment workflow to cut average iteration time below 1 hour and cost below $50 per run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2299", "title": "TPU MXU Padding and Memory Stalls", "topic": 
"neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is MXU utilization only 14% on the TPU v4 Pod, and what changes would you make to improve throughput?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2300", "title": "Hopper FP8 MoE Latency Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did FP8 make MoE decoding slower, and how would you fix the routing and GEMM kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Hopper does not support FP8 for MoE because expert routing requires FP16 precision.", "FP8 runs at double clock speed, causing thermal throttling.", "Skewed token counts cause unaligned FP8 GEMMs and high amax overhead; fix is padding and fusion.", "Memory bandwidth for FP8 scaling factors exceeds savings."], "correct_index": 2}}, {"id": "cloud-2301", "title": "Diagnosing Gradient Checkpointing Bypasses", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 1GB-per-layer memory growth despite checkpointing, and how much memory is being retained?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2302", "title": "MoE Expert Parallelism Decode Bottleneck", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the MoE model failing to achieve expected performance, and how do you redesign the deployment?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 3}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2303", "title": "GQA Prefill Latency Regression", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the prefill regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2306", "title": "The Silent String-to-Hash Collision", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What could cause prediction variance to collapse to 0.05 despite all schema fields being present and non-null?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": 
{"cloud-chain-auto-secondary-015-27": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2307", "title": "The Diluted Regional Distribution Shift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why are SEA users seeing hallucinated translations when the global KL drift metric remains below threshold?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 3}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2308", "title": "GPU Starvation from Cloud Storage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing low GPU utilization and pegged CPU during ViT training from S3, and how would you fix the input pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2309", "title": "Cross-Region Training Latency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are training time and egress costs exploding with the 50TB dataset in us-east-1 and H100s in us-west-2, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2310", "title": "Data Skew OOMs in Preprocessing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do only a few Spark executors OOM during the user-item join, and how would you change the join to fix it?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 3}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2311", "title": "Distributed Storage Metadata Thrashing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 throughput collapse when scaling from 16 to 128 GPUs on NFS, and how would you redesign storage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2312", "title": "Periodic Latency Spikes in Feature Store", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 5-minute 3000ms latency spikes in the Kafka-Flink-Redis pipeline, and how would you eliminate 
them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2313", "title": "Auto-Tuning Budget vs Dynamic Batching", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you meet the 5ms latency SLA and the 180-minute compilation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2314", "title": "Evaluating MLIR Fusion for Bandwidth", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "You propose an MLIR lowering pass to fuse these element-wise operations. What is the expected speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2315", "title": "FSDP AllGather PFC Storms at 4K Scale", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the FSDP AllGather tail latency at 4,096 GPUs, and how would you fix the RoCEv2 network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2316", "title": "Optimizing FP8 Formats for LLM Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should you map E4M3 and E5M2 across activations, gradients, and weight updates to avoid FP8 training divergence?", "chain_ids": ["cloud-chain-auto-014-11"], "chain_positions": {"cloud-chain-auto-014-11": 0}, "chain_tiers": {"cloud-chain-auto-014-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2317", "title": "Attention Softmax FP16 Overflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 70B model produce NaNs beyond 16k context, and what precision change fixes attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2318", "title": "INT8 KV Cache Per-Tensor Quantization Failure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does per-tensor INT8 KV cache quantization break the 13B LLM, and what quantization scheme would preserve quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2319", "title": "Mixed Precision Optimizer State Underflow", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does casting Adam states and master weights to FP16 cause the 10B model's loss to plateau after 5,000 steps?", "chain_ids": ["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 2}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2320", "title": "FP8 Delayed Scaling Out-of-Bounds in Production", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does delayed scaling with FP8 E4M3 produce NaNs when prompts shift abruptly, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2321", "title": "RoPE Embedding Degradation in BF16", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do retrieval tasks fail beyond position 60,000 with BF16 long-context inference, and how would you fix RoPE computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2322", "title": "Mitigating W8A8 Activation Outliers via Mathematical Migration", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you re-architect the quantization pipeline to preserve accuracy without dropping to 16-bit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2323", "title": "MoE Router Logit Overflow in FP16", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do all tokens suddenly route uniformly in the FP16 MoE, and what precision change should be made to the router?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2324", "title": "BF16 Accumulation Precision Loss in Massive GEMMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does BF16 accumulation in the 16384 x 16384 projection hurt validation accuracy, and what accumulation precision should be used?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 1}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2325", "title": "INT4 KV Cache Group Size Architecture", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "Why does per-channel quantization fail for KV caches, and how does grouped quantization fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2326", "title": "Dynamic Loss Scaling in Mixed Precision", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does a static loss scale of 65536 make the FP16 ViT loss NaN on step 1, and what scaling strategy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2327", "title": "Large Vocabulary Cross-Entropy Overflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP16 language-model head and cross-entropy cause NaN loss with a 256,000-token vocabulary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2328", "title": "Activation-Aware Weight Quantization (AWQ)", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AWQ recover zero-shot accuracy after RTN INT4 fails, without changing the 4-bit data type or inference kernel?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2329", "title": "FP8 KV Cache Asymmetry: K vs V Degradation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP8 E4M3 work for the V cache but cause a 5% MMLU drop for the K cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2330", "title": "MoE Router Overhead in FP8 Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you eliminate the 15ms/token router casting penalty while preventing FP8 E4M3 router overflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2331", "title": "MoE Interconnect Bottleneck on TPU Pods", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is MFU stuck at 15% for the 1.2T MoE on TPU v5e, and how would you redesign MoE routing and parallelism?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 3}, "chain_tiers": 
{"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2332", "title": "The FP8 Speedup Illusion in LLM Decoding", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP8 only improve batch-1 decoding from 22 to 24 tokens/sec on the H100, and what would increase throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2333", "title": "Systolic Array Tile Padding Collapse", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does throughput collapse when the batch size changes from 256 to 257 on the 256x256 NPU, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2334", "title": "FlashAttention SRAM Bank Conflicts", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the 12.5% SRAM bandwidth utilization in this FlashAttention kernel, and how would you fix it?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 4}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SRAM is too small (128MB), causing the kernel to thrash to HBM. Fix it by increasing the SRAM size or reducing the sequence length.", "The head dimension of 128 creates severe SRAM bank conflicts because it is a multiple of the bank stride, serializing parallel load requests. Fix it by padding the inner dimension in shared memory to skew consecutive column accesses across different banks.", "The compute SMs are stalled because FlashAttention is compute-bound by the softmax operation. Fix it by using an approximation of softmax to reduce FLOPs.", "The custom accelerator's clock speed is too low for a sequence length of 4096. 
Fix it by pipelining the matrix multiplications to hide latency."], "correct_index": 1}}, {"id": "cloud-2335", "title": "Dataflow Compiler Thrashing on Dynamic Shapes", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the static dataflow accelerator show 850ms p99 and 90% idle time for variable-length speech inputs, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2336", "title": "FPGA Routing Congestion and Clock Collapse", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why did adding the 256x256 MLP layer drop FPGA fMAX from 250MHz to 110MHz, and what RTL/HLS change would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2337", "title": "3D Convolution L2 Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes the 94% L2 miss rate and 8x HBM read amplification in the 256x256x256 3D CNN, and how would you change the layout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2338", "title": "Die-to-Die Interconnect Bottleneck in Chiplet ASICs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the chiplet accelerator run GeLU at 800 TFLOPS but the reduction-heavy GEMM at 120 TFLOPS, and how should GEMM be partitioned?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 3}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2339", "title": "HBM Thermal Throttling on Zipfian Embeddings", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does throughput fall as HBM stack 0 reaches 95°C while total HBM bandwidth is only 18%, and how would you distribute the load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The total memory bandwidth is saturated by cache evictions, causing the entire HBM to overheat. The load should be distributed by increasing the TPU core clock.", "Highly skewed Zipfian accesses to a few popular embeddings located on Stack 0 cause a localized physical thermal hotspot, triggering hardware throttling despite low overall bandwidth. 
The load should be distributed by replicating these hot embeddings across all 4 HBM stacks.", "The model has a memory leak that specifically targets HBM Stack 0, causing it to overheat as it stores redundant data. The load should be distributed by flushing the memory periodically.", "The 18% HBM bandwidth utilization is a false metric; the TPU actually processes 1200 GB/s internally, causing Stack 0 to overheat. The load should be distributed by using larger batch sizes."], "correct_index": 1}}, {"id": "cloud-2340", "title": "INT4 Weight-Only Quantization Slowdown on A100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT4 weight-only quantization make the 65B model slower despite shrinking weights, and what quantization/deployment fix would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU lacks native INT4 Tensor Cores, so ALUs must unpack INT4 to FP16, creating an 80ms compute bottleneck; fix by using INT8 natively or pre-packing weights.", "The memory bandwidth drops to 17.5ms, but unaligned accesses add 78ms of penalty; fix by using FP16.", "CPU-to-GPU transfer of the 35GB INT4 weights is bottlenecked by the PCIe Gen4 bus.", "INT4 quantization causes massive activation scale outliers that force a fallback to FP32 kernels."], "correct_index": 0}}, {"id": "cloud-2341", "title": "XLA Compiler Host OOM on Long Sequence Attention", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does compiling 128k FlashAttention OOM the 512GB host CPU despite <2GB device memory, and how would you avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2342", "title": "ZeRO-3 Host Offload PCIe Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are the 8 GPUs only 12% utilized with ZeRO-3 CPU offload despite idle NVLink, and what architecture changes would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2343", "title": "PCIe Switch Oversubscription in KV Paging", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do KV-cache host-to-GPU transfers spike above 600ms with 16 concurrent requests, and how should the allocator/scheduler account for PCIe topology?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 1}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2344", "title": "Hybrid Interconnect MoE Routing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", 
"bloom_level": "create", "status": "published", "phase": "both", "question": "How would you map the 2GB-per-GPU MoE All-to-All onto the NVLink-pair plus PCIe-switch topology to reduce the 60% throughput loss?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2345", "title": "Bypassing CPU for Massive Checkpoints", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do asynchronous 80GB-per-GPU checkpoints stall for 45s through the host CPU, and what data path should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVMe filesystem has insufficient IOPS to handle 640 GB of concurrent writes, requiring an upgrade to a parallel filesystem like Lustre.", "The CPU memory bus and PCIe pathways become saturated because the checkpoint data is bounced through host RAM, which can be bypassed using GPUDirect Storage (GDS).", "The PyTorch training loop is Single-Thread-Bound, taking 45.7 seconds (640GB / 14GB/s) to serialize the tensors.", "The dual 200Gbps NICs are bottlenecking the transfer; upgrading to 400Gbps NICs will eliminate the 45-second stall by providing 100 GB/s."], "correct_index": 1}}, {"id": "cloud-2346", "title": "CXL vs HBM in Embedding Tables", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will adding a CXL 2.0 Type 3 memory module let the 1.5TB embedding service reach 40M lookups/sec, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2347", "title": "RoCEv2 Incast in 3D Parallelism", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the >60ms tail-latency spikes when 64 GPUs send 50MB to one target, and how would you mitigate this RoCEv2 incast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2348", "title": "NUMA Boundaries in High-Throughput DMA", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is PCIe Rx capped at 11GB/s per GPU with one CPU thread saturated, and how would you fix the host transfer path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2349", "title": "Multi-Tenant Serving: LoRA vs KD", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy should you choose for 100 customized 8B LLM tenants—100 distilled 1B models or one 8B base with LoRA adapters—and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2350", "title": "The Auto-Scaler Fragmentation Deadlock", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 50 strict 64-GPU jobs stuck despite 4,400 idle GPUs, and why does adding 50 nodes per day not fix the queue?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 3}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2351", "title": "The Multi-AZ AllReduce Cost Explosion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did adding 128 nodes in AZ-B cut throughput by 40% and create a massive cross-AZ bill, and how should scheduling change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The new AZ-B nodes have defective networking hardware that drops packets, causing the 40% throughput loss and $170k/week egress cost due to TCP retransmissions. The scheduler should isolate these nodes.", "A topology-agnostic scheduler allowed synchronous Ring AllReduce to span across AZ-A and AZ-B. The slow inter-AZ link bottlenecked the GPUs, and the massive gradient transfers incurred the ~$170k/week egress bill. The scheduler must enforce locality-aware, single-AZ placement.", "The training loop is using FP8 quantization which falls back to CPU computation when spanning multiple AZs, causing the throughput drop and high egress costs. The scheduler should disable FP8 for cross-AZ jobs.", "The 128 new nodes in AZ-B are reading training data from an object store in AZ-A. The cross-AZ data loading is starving the GPUs of batches and generating the $170k/week bill. 
The scheduler should replicate the data to AZ-B."], "correct_index": 1}}, {"id": "cloud-2352", "title": "Asynchronous Checkpoint Storage Tiering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect storage and checkpoint data flow for the 16,384-GPU job to sustain 1.6TB/s reads and keep checkpoint pauses under 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2353", "title": "Petabyte-Scale Multimodal Streaming", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design ingestion to sustain 2 TB/s throughput without hitting API rate limits, excessive costs, or cache-warming penalties?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2354", "title": "Diagnosing PCIe NUMA Mismatch", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do GPUs 4-7 train at 700 images/s while GPUs 0-3 hit 1500 images/s, and how should the PyTorch dataloader be pinned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2355", "title": "Root Causing NVLink P2P Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are TP=4 AllGather ops capped near 24GB/s instead of 300GB/s NVLink on the 4-GPU node, and how would you restore P2P?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2356", "title": "Diagnosing RoCEv2 PFC Storms in MoE", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 3-5s network collapses and NCCL watchdog crashes during MoE All-to-All on RoCEv2, and how would you mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2358", "title": "Diagnosing Pageable Memory Bounce Buffers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 200MB host-to-GPU feature transfers randomly spike from 8ms to 45ms, and what memory allocation pattern would prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe bus is congested by other background tasks. 
Fix this by upgrading to PCIe Gen5 or reducing the batch size.", "The CUDA driver must dynamically allocate a hidden pinned 'bounce buffer' and perform a host-to-host copy before DMA transfer. Fix this by pre-allocating a persistent ring buffer of pinned memory using `cudaMallocHost`.", "The host CPU is dynamically generating features too slowly, causing GPU starvation. Fix this by moving feature generation to the GPU.", "The `cudaMemcpyAsync` API inherently has high tail latency due to CPU-GPU synchronization overhead. Fix this by switching to synchronous `cudaMemcpy`."], "correct_index": 1}}, {"id": "cloud-2359", "title": "Root Causing Containerized IPC Fallback", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving the 8x GPU DDP job into Kubernetes raise iteration time from 400ms to 1200ms, and how would you fix the NCCL fallback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The container is throttling CPU requests, slowing down the dataloader. Fix this by increasing CPU limits in the Pod spec.", "The container restricts PCIe bandwidth to the GPU. Fix this by adding the `nvidia.com/gpu` resource limit.", "The container restricts the `/dev/shm` shared memory size to 64MB, causing NCCL to fall back from NVLink. Fix this by mounting a larger `tmpfs` volume to `/dev/shm`.", "The Kubernetes network plugin is routing intra-node traffic through the pod overlay network. Fix this by setting `hostNetwork: true`."], "correct_index": 2}}, {"id": "cloud-2360", "title": "Diagnosing PCIe Contention from Checkpointing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does iteration time spike from 2.5s to 18.5s every 1000 steps during async 80GB checkpointing, and how would you reduce the stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2361", "title": "Root Causing RDMA DCQCN Slow Start", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 70B TP-over-RDMA inference stuck at 45ms/token with <10% link utilization and no drops, and how would you tune the network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2362", "title": "Diagnosing CXL Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do 1M random lookups on the 1TB CXL memory table take 120ms while bandwidth is only 2GB/s, and how would you speed them up?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 2}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CXL 2.0 module is defective 
because it is only achieving 2GB/s instead of 32GB/s. The hardware should be RMAd and replaced with a module that has working PCIe lanes.", "The PCIe Gen5 bus is congested by other background traffic, artificially capping the CXL module to 2GB/s. The application should use PCIe Quality of Service (QoS) to prioritize embedding fetches.", "Random sparse lookups cause severe CPU cache thrashing. Because CXL latency is ~250ns, serialized fetches limit throughput. The solution is to use software-managed batching and prefetching to overlap latency and saturate the bandwidth.", "Embedding parallelism is unnecessary because embeddings can be stored on CPU and looked up via PCIe; the sparse lookup pattern means only a few KB per batch needs to transfer from CPU to GPU."], "correct_index": 2}}, {"id": "cloud-2363", "title": "Root Causing H2D Serialization in CUDA Streams", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU idle ~34% of the time despite using cudaMemcpyAsync, and how would you overlap transfer and compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2364", "title": "Diagnosing Multi-Tenant RDMA QoS Interference", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 10ms RDMA fraud-inference requests spike to 150ms during a TCP Hadoop shuffle, and what network QoS configuration would prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2365", "title": "Fat-Tree Oversubscription Tradeoff", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you accept the $15M savings from a 2:1 oversubscribed 400Gbps fat-tree, and what is the expected All-Reduce latency penalty?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 3}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2366", "title": "Dragonfly Topology for MoE Routing", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 32,000-GPU MoE training, should you choose Dragonfly over a Fat-Tree for 60% fewer links, and what latency risks arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2367", "title": "ECMP vs Adaptive Routing for Elephant Flows", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you replace static ECMP with packet-level adaptive routing to push RoCEv2 utilization above 60%, and what hardware tradeoff does it 
create?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 3}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2368", "title": "RoCEv2 vs InfiniBand Congestion Control", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 8,192 GPUs with frequent 64-to-1 incast, how do InfiniBand and RoCEv2 trade off congestion handling, deadlock risk, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2369", "title": "Rail-Optimized vs Standard Leaf-Spine", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which 8-NIC leaf-spine layout should you choose—same-ToR or rail-optimized—and how do they trade All-Reduce performance against fault tolerance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2370", "title": "RDMA vs RPC for Disaggregated KV-Cache", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For transferring a 15GB KV-cache between prefill and decode nodes, how do gRPC over TCP and GPUDirect RDMA trade latency, CPU load, and complexity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2371", "title": "AEC vs AOC for 800G Leaf Links", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 24,000 800Gbps 1.5-2.5m NIC-to-leaf links, should you use AECs or AOCs, and what are the power tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2372", "title": "PFC Deadlock Recovery Latency Tradeoff", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you choose between 100us and 50ms PFC deadlock timeouts, and what are the performance and stability tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2373", "title": "Scale-Up vs Scale-Out for 256 GPUs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which 256-GPU architecture should you choose for 8-way TP and 32-way PP, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2374", "title": "FEC Latency in 
Distributed Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you disable KP4 FEC to save 110 ns per hop, or keep it enabled for the 15-hop token path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2375", "title": "3D Torus Bisection Bandwidth Limitation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you run the 4,096-GPU All-Reduce workload on the 3D Torus or migrate to a non-blocking Fat-Tree, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2376", "title": "RDMA Memory Registration Overhead", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the GNN pipeline use dynamic RDMA memory registration or a pre-pinned 100GB GPU pool, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2377", "title": "Cloud-Assisted Wake Word Pipeline", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you split wake-word and intent processing between the MCU and the cloud to stay under the 20mW budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2379", "title": "OTA Delta Updates for Wearables", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect the cloud-to-wearable update pipeline for a 200KB model over 50 kbps BLE to minimize power and SRAM usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2380", "title": "Tiered Inference for Video Doorbells", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tier PIR, MCU person detection, and cloud Face ID so the 5000mAh doorbell lasts 6 months?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 3}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2382", "title": "Federated Personalization of Wake Words", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you personalize the 50KB keyword model 
on 256KB earbuds without sending audio or exceeding SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2383", "title": "Edge-Cloud Traffic Camera Bandwidth Optimization", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you split perception and tracking between 5,000 LTE cameras with edge NPUs and the cloud to reduce data cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2384", "title": "Privacy-Preserving Health Wearable FL", "topic": "federated-learning", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect federated sleep-stage training for 1M smart rings without draining the 20mAh battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Upload raw PPG and Temp data continuously over BLE to a phone gateway for centralized cloud training, avoiding on-device compute.", "Use cross-device Federated Learning with edge feature extraction. The ring extracts low-dimensional features, performs local SGD while charging, and syncs lightweight model updates via BLE to the phone.", "Stream the raw 50Hz data directly to the cloud via the ring's built-in LTE connection, bypassing the phone entirely to save BLE power.", "Send the full raw data over BLE in a single batch once a week, buffering the 60 MB of data in the ring's 128KB SRAM."], "correct_index": 1}}, {"id": "cloud-2385", "title": "High-Throughput Streaming Drift Detection", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign drift monitoring for 20,000 QPS of 256-dimensional embeddings without OOMing the sidecar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2386", "title": "Cost-Aware Shadow Testing Architecture", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the two-tower model without paying for a 100% 7-day shadow deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2387", "title": "Resolving Feature-Model Desynchronization", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you prevent the V2 model rollout from causing Redis cache-miss storms on user_affinity_v2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2389", "title": "Mitigating Cold Start Cascading Failures", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you redesign autoscaling so 60GB model weights don't make new pods crash-loop during the 3,000 QPS spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2391", "title": "Drift Detection with Extreme Label Delay", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect the new fraud ring before 45-day chargeback labels mature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2392", "title": "Decoupling Storage in High-Frequency Deployments", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign model deployment so 40 daily updates don't push 15GB Docker images or slow scale-outs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2393", "title": "Production Debugging of Tensor Memory Leaks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you stop the PyTorch serving pods from OOMKilling every ~12 days while debugging the underlying memory leak?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 2}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2394", "title": "Eliminating Non-Determinism in ML CI/CD", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the golden-logit GPU integration test flaky at 1e-6 tolerance, and how would you stabilize it?", "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2395", "title": "Power-Capped Rack Density Tradeoff", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which rack configuration maximizes total tokens/sec under the 40kW limit, and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 5}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2396", "title": "Carbon-Aware Multi-Region Routing", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", 
"zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the weekly 10,000 GPU-hour job run in US-East or EU-North to minimize carbon, and what is the cost tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2397", "title": "Minimizing Serving Cost per Token", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which instance strategy minimizes TCO per million Mixtral tokens for the 10,000 tok/sec SLA, A100 or H100?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2398", "title": "Federated Learning Edge-Cloud Energy", "topic": "federated-learning", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do Configurations A and B compare on edge energy and cloud cost, and can either meet the 2% battery limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Config A drains 13.8% battery and costs $200; Config B drains 7.7% and costs $40. Neither meets the 2% battery limit without model compression.", "Config A meets the 2% battery limit because it only runs 1 local epoch, whereas Config B drains 7.7% battery due to the high compute cost of 10 epochs.", "Config B meets the 2% battery limit because it reduces communication rounds from 100 to 20, saving massive amounts of Tx/Rx energy.", "Both configurations meet the 2% battery limit, but Config B is preferred because it reduces cloud costs from $200 to $40."], "correct_index": 0}}, {"id": "cloud-2399", "title": "Datacenter Liquid Cooling TCO", "topic": "thermal-management", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 20MW datacenter, should you choose air or direct-to-chip liquid cooling over 3 years, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-43"], "chain_positions": {"cloud-chain-auto-secondary-015-43": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2400", "title": "API Defenses Against Model Extraction", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you defend the 70B LLM API against model extraction while keeping TTFT under 150ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2401", "title": "TEEs for Cloud Medical Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you deploy the 5GB ViT-L with TEEs for HIPAA data-in-use encryption while supporting 200 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2402", "title": "High-Throughput DP-SGD Training", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you train the 500M-record embedding model with epsilon ≤ 1.5 in under 24 hours without OOMing A100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2403", "title": "Defending Against Streaming Poisoning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you defend the 30-minute fine-tuning pipeline against data poisoning in 50,000 phishing reports/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2404", "title": "Secure ML Model Supply Chain", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you secure PyTorch model loading against Pickle RCE without exceeding the 15s P99 pod startup SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2405", "title": "Pipelined Guardrails in Compound Systems", "topic": "compound-ai-systems", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you orchestrate retrieval, generation, and the 300ms guardrail to meet the 1000ms P90 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a Pipelined Guardrail architecture by chunking the LLM output every 50 tokens. 
The guardrail evaluates chunks asynchronously, overlapping with the generation of subsequent chunks, which reduces the critical path to 950ms and meets the SLA.", "Stream the LLM response directly to the client as it is generated, bypassing the guardrail for the first 100 tokens to ensure the Time-To-First-Token (TTFT) meets the 1000ms SLA.", "Reduce the LLM generation target from 150 tokens to 100 tokens to save 200ms, and run the 300ms guardrail sequentially at the end to achieve exactly 950ms.", "Deploy the Guardrail model onto the same GPU as the main LLM to eliminate network transfer latency, saving exactly 200ms to hit the 950ms target."], "correct_index": 0}}, {"id": "cloud-2406", "title": "Activation Outliers in Large Models", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the W8A8 degradation, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2407", "title": "FP8 MoE Routing Collapse", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cause of this sudden routing degeneration, and how do you fix the precision configuration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2408", "title": "KV Cache Quantization Sinks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this long-context degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2409", "title": "QAT Gradient Oscillation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the mechanistic cause of this training instability during INT4 QAT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2410", "title": "Calibration Overfitting in GPTQ", "topic": "extreme-quantization", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did Java performance specifically degrade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2411", "title": "Unfused Dequantization Bottleneck", "topic": "extreme-quantization", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the architectural cause of this missing speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2412", "title": "Batch Size vs 
Quantization Pareto Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this inversion in the performance Pareto curve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2413", "title": "TPU Systolic Array Underutilization", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is TPU v4 slower than A100 at batch size 1, and how would you co-design the serving stack to fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2414", "title": "MoE Routing Bandwidth Starvation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 128-expert MoE only 12% utilized, and what hardware-aware routing or placement change would restore compute efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2415", "title": "FPGA Spatial Pipeline Mapping", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did standard HLS miss the 10us SLA, and how would you map the 5-layer MLP onto the FPGA to meet it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2416", "title": "FP8 Quantization on Tensor Cores", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FP8 formats should you use for weights and activations on H100, and why is E4M3 failing in attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2417", "title": "Custom ASIC SRAM Tiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the compiler map self-attention for sequence length 4096 onto the 16MB SRAM to avoid HBM spilling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2419", "title": "Unstructured vs Structured Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 60% unstructured pruning fail to speed up ResNet-50 on A100, and what sparsity pattern would actually 
accelerate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2420", "title": "Arithmetic Intensity & Cache Thrashing", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does co-locating the embedding model under MPS spike MLP latency, and should you use MPS or MIG?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2421", "title": "H100 FP8 Migration Throughput Cliff", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does moving from 4x A100 INT8 to 2x H100 FP8 barely increase QPS, and what must change in the KV cache path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2422", "title": "W8A8 PTQ Extreme Activation Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive W8A8 PTQ break the 130B model, and how would you handle the 0.1% activation outlier channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2423", "title": "Long-Context INT4 KV Cache Degradation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did INT4 KV cache quantization hurt needle-in-haystack accuracy, and how would you preserve long-context retrieval?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2424", "title": "CTR Embedding QAT Pipeline Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did FakeQuant QAT slow embedding training by 4x, and how would you redesign QAT to meet the daily retraining SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2425", "title": "Multi-Tenant LoRA Base Model Calibration", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do code LoRA adapters lose pass@1 after INT8 base quantization while chat is fine, and how should you calibrate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2426", "title": "W4A16 Batch-Size Latency Inversion", "topic": "extreme-quantization", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "Why is the GPTQ INT4 model faster at batch size 1 but slower than FP16 at batch size 128?", "chain_ids": ["cloud-chain-auto-secondary-011-24"], "chain_positions": {"cloud-chain-auto-secondary-011-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-24": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2427", "title": "FP8 Distributed Training Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did all-E4M3 FP8 training produce zero router gradients and NaNs, and which FP8 formats should be used for forward and backward?", "chain_ids": ["cloud-chain-auto-014-11"], "chain_positions": {"cloud-chain-auto-014-11": 1}, "chain_tiers": {"cloud-chain-auto-014-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2428", "title": "Diffusion Model INT8 PTQ Artifacts", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why does INT8 PTQ cause SDXL banding despite target-domain calibration, and how should activation scales be calibrated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2429", "title": "The FP8 Autoregressive Decoding Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did decoding not scale directly with the 2x compute increase, and what explains the ~1.8x speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2430", "title": "The A10G vs V100 CNN Regression", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did MobileNetV3 inference get 50% slower on an A10G despite 92% higher FP16 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2431", "title": "The DLRM Vertical Fusion Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did fusing embedding lookup with the first dense MLP layer double DLRM step latency on TPU v4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2432", "title": "The RoPE Recomputation Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does recomputing RoPE sin/cos on the fly make the kernel 4x faster despite 400% more FLOPs?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2433", "title": "The GQA Prefill Illusion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does GQA improve decoding throughput 3x but reduce end-to-end latency by only 2% for 50k-token prompts and 10 outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2434", "title": "The CSR Sparsity Performance Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing dense GEMMs with CSR SpMM make the 70%-sparse BERT-Large 5x slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2435", "title": "The Over-Batched T4 Throughput Plateau", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does increasing BERT-base batch size from 32 to 256 flatline throughput while P99 latency spikes on T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2436", "title": "LLM Decoding Bandwidth Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 100 tokens/sec at batch size 1 impossible for the 30B FP16 model on one A100 despite needing only 6 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2437", "title": "The Long-Context Attention Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Would upgrading to H100 deliver a 3x speedup for 64k-token MHA prefill, or what should you change instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2438", "title": "The Quantization Throughput Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does quantizing the DLRM MLPs to INT8 leave throughput flat at about 5,100 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2439", "title": "Graph Neural Network Hardware Selection", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which hardware should you choose for the GraphSAGE inference workload, A100 or TPU v4, 
and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2440", "title": "Continuous Batching Arithmetic Shift", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Will continuous batching from batch size 2 to 16 raise per-token latency to 280ms, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2441", "title": "Tensor vs Pipeline Parallelism Roofline", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "At batch size 1, should the 70B model use PP=8 or TP=8 on the 8-GPU node, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2442", "title": "The MoE Memory Bandwidth Tax", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 8x40B MoE with the same active FLOPs as the 40B dense model drop from 500 to 120 tokens/sec?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 1}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2443", "title": "The Unstructured Pruning Illusion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 50% unstructured pruning halve FLOPs but leave 14B batch-1 decoding latency stuck at 31ms on A10G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2444", "title": "Batched Prefill Throughput Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does batching 16 simultaneous 2048-token prefill requests barely increase tokens/sec on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2445", "title": "FlashAttention Head Dimension Spillage", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does doubling the attention head dimension from 128 to 256 slow FlashAttention-2 training by 45% despite identical FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2446", "title": "The Activation Checkpointing Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does full activation checkpointing increase 30B fine-tuning step latency by 75% instead of the expected 33%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2447", "title": "MoE Routing at High Batch Size", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 16x10B top-1 MoE 30% slower than the dense 30B model at batch size 128 despite 3x fewer active FLOPs?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 2}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2448", "title": "The W16A8 Quantization Trap", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W16A8 quantization fail to reduce batch-1 TPOT for the 70B model on H100, and what quantization would help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2449", "title": "The MQA Prefill Disappointment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does replacing MHA with MQA fail to reduce prefill latency for a single 16K-token prompt on the L4 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2450", "title": "The Embedding Layer Bandwidth Illusion", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the embedding lookup limited to about 22.3 GB/s on H100 despite peak bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2451", "title": "Speculative Decoding Draft Model Sizing", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which draft model should you choose for speculative decoding, the 1.5B or 7B, to meet the 20ms TPOT budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2452", "title": "Prefill and Decode Cluster Disaggregation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you disaggregate prefill and decode over 100Gbps RoCE, and what latency tradeoff must you evaluate?", "chain_ids": ["cloud-chain-auto-006-02"], "chain_positions": 
{"cloud-chain-auto-006-02": 2}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2453", "title": "Paged Attention Block Size Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 256-token Paged Attention block size cause OOM at batch size 32, and what block-size tradeoff should you make?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 2}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2454", "title": "Chunked Prefill for Latency Jitter", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does chunked prefill reduce P99 TPOT spikes from the 4000-token prompt, and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 2}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2455", "title": "KV Cache PCIe Offloading", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design CPU KV-cache offload for 100K-token sessions, and what latency tradeoff must you manage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2456", "title": "Multi-LoRA Continuous Batching", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can you batch requests across 100 different LoRA adapters on one 13B base model without destroying GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2457", "title": "System-Prompt Prefix Caching", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the benefits and costs of using RadixAttention prefix caching for the shared 1500-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2458", "title": "The Edge Fleet Compilation Dilemma", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you move edge model compilation to the cloud without 10-minute device downtime or 15% deployment failures?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2459", "title": "The NPU Fallback Latency Cliff", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does YOLOv8-nano jump to 185ms and 85% CPU utilization despite targeting the same edge NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2460", "title": "The Hybrid Cascade Bandwidth Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does end-to-end latency spike above 2500ms during the festival despite cloud GPU autoscaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2461", "title": "The Unfused Dequantization Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the W4A16 INT4 model run at only 3 tokens/sec on the mobile GPU, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2462", "title": "The Cloud-to-Edge Calibration Shift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the INT8 model fail at night while the FP32 model works, and how would you fix the quantization pipeline?", "chain_ids": ["cloud-chain-auto-014-14"], "chain_positions": {"cloud-chain-auto-014-14": 0}, "chain_tiers": {"cloud-chain-auto-014-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2463", "title": "The NPU Context-Switching Tax", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the three-model edge pipeline 80ms instead of 30ms, and how would you restructure serving to improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2464", "title": "The Asymmetric Heterogeneous Offload", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is forcing the whole ViT onto the weaker mobile GPU faster than splitting MatMuls to the NPU and LayerNorm/Softmax to the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2465", "title": "Cloud-Edge Cascade for Retail Analytics", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": 
"published", "phase": "both", "question": "How would you architect a cloud-edge cascade to minimize 3-year TCO while maintaining >95% recall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2466", "title": "Cost-Aware Hybrid LLM Routing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you route summarization between on-device Llama-3-8B and cloud models to cut cost while keeping latency under 2s for all users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2467", "title": "Optimizing Edge CI/CD Device Farm Costs", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the device-farm deployment pipeline to cut testing cost by 80% without letting NPU-crashing models ship?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2468", "title": "Cellular Cost Optimization via Edge LoRA", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign the daily POS model update system to reduce cellular data costs by more than 90% while preserving offline inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2469", "title": "Edge-Triggered Cloud Telemetry", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you selectively ingest rare pedestrian edge cases from 10,000 robots within a $500/day 5G budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2470", "title": "Hybrid ASR Compute Offloading", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you add on-device processing to cut smart-speaker cloud costs by at least 50% without increasing latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2471", "title": "On-Device Ad Ranking for Cloud Savings", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign ad ranking to cut cloud DLRM costs by 75%, keep latency under 100ms, and use local app history?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2472", "title": "Cloud Fallback Minimization via AOT 
Compilation", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you architect the deployment pipeline to cut cloud costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2473", "title": "The NPU Fallback Ping-Pong", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the structurally similar Model B perform 10x worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2474", "title": "The Hybrid Cascade False Positive Storm", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the cloud ingest bandwidth spike by 10x at dusk, crashing the gateway?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2475", "title": "The Edge LLM Memory Wall", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the limiting factor preventing the Edge NPU from achieving higher tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2476", "title": "The Asymmetric Quantization Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did switching from symmetric to asymmetric INT8 quantization raise edge latency from 15ms to 60ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2477", "title": "The Edge Gateway Batching Paradox", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does strict batch size 8 fail the 60ms safety SLA despite higher GPU throughput, and what batching policy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2478", "title": "The Self-Inflicted OTA DDoS", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the midnight OTA rollout of a 2GB model saturate the retail SD-WAN, and how would you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2479", "title": "The Thermal Throttling Ticking Clock", "topic": "power-budgeting", 
"competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the framerate drop to 12 fps after exactly 90 seconds of tracking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2480", "title": "The SRAM Spilling Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you modify the compiler to hit 5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2481", "title": "Edge-Cloud Hybrid Video Break-Even", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which option has the lower 3-year TCO for 5,000 traffic cameras, and what accuracy/deployment tradeoffs must you evaluate?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 2}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2482", "title": "Fleet-Wide Heterogeneous Deployment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you redesign the mobile BERT deployment pipeline to reduce P99 latency and battery drain across 15 Android SoCs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2483", "title": "The NPU Fallback Latency Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the ~330ms P99 latency caused by unsupported INT8 GeLU on the target NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2485", "title": "Depthwise Quantization Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does PTQ drop MobileNetV2 mAP from 42 to 18, and how would you preserve INT8 accuracy on the edge?", "chain_ids": ["cloud-chain-auto-014-14"], "chain_positions": {"cloud-chain-auto-014-14": 1}, "chain_tiers": {"cloud-chain-auto-014-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2486", "title": "Multi-Tenant Edge Accelerator Allocation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you schedule Model A and Model B on one Edge TPU to meet 
30fps and 10fps requirements without 80ms latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2487", "title": "Bandwidth-Constrained OTA Pipelines", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the weekly OTA deployment pipeline to reduce cellular data costs by over 90% without degrading model performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2488", "title": "Cloud-Edge Federated Learning Memory Limits", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign federated learning for 5 million Cortex-M33 sensors so training fits in 256KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2489", "title": "Power-Aware Cloud-Edge Fallback", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you set the cloud-edge confidence thresholding so the wearable guarantees 14-day battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2490", "title": "Async FL for Intermittent Solar Devices", "topic": "federated-learning", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign FedAvg for 100,000 solar sensors that wake for only 5 seconds per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2491", "title": "Cloud-Managed Federated QAT", "topic": "federated-learning", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you enable federated learning on 2 million INT8 smartwatches without exceeding 256KB SRAM or BLE bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2493", "title": "Cloud-Edge Sensor Fusion Bandwidth Minimization", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition predictive-maintenance inference so 10,000 Cortex-M4 sensors stay under the 50MB/s WiFi cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2494", "title": "Extreme Power-Budgeted Federated Edge", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": 
"How would you design the collar FL update policy to stay within the strict 36 Joules/day energy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2495", "title": "Federated Learning Power and Comm Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How many FL rounds per day can each watch afford, and how should the cloud orchestrate participation under the 1% battery limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2496", "title": "Cloud NAS Hardware-in-the-Loop Optimization", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the NAS model draw 15mW despite fitting in SRAM, and how should the NAS pipeline change to meet the <5mW target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2497", "title": "Over-the-Air Weight Updates for IoT", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you update the 150KB INT8 model weekly over NB-IoT without violating the 5-year battery-life constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2498", "title": "Asynchronous FL with Edge Stragglers", "topic": "federated-learning", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What FL strategy should replace synchronous FedAvg when 85% of solar sensors drop during the 5-minute training window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2499", "title": "Edge-Cloud Partitioning for Acoustic Monitoring", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you balance local inference and cloud fallback for 1000 bird-call events/day to save energy while preserving rare-bird accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2500", "title": "Memory Bottleneck in Edge Personalization", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign on-device personalization so the smart ring can train without exceeding its 256KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2501", "title": "Cloud Drift Detection via Edge Proxies", "topic": 
"federated-learning", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you monitor concept drift across 1M vibration sensors over 10kbps LoRa without uploading raw 16kHz data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2502", "title": "Mixed-Precision Federated Aggregation", "topic": "federated-learning", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you orchestrate FL for INT8 Cortex-M0+ devices without FP32 support while avoiding INT8 training divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2503", "title": "Carbon-Aware Scheduling Tradeoffs", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pause RLHF training for 6 high-carbon hours, or use another strategy to reduce carbon, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2504", "title": "LLM Serving Batch Size Energy", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you meet the 50ms TTFT SLA without destroying cost per token for Llama-3-70B serving?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2505", "title": "Overprovisioning Under Power Caps", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Under a 10MW cap, would you deploy 10,000 GPUs at 1000W or 14,000 GPUs power-capped to 700W, and what is the TCO trade-off?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 4}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2506", "title": "Liquid Cooling Retrofit ROI", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does the $8M D2C liquid-cooling retrofit pay off for a 5MW datacenter, and what operational factors affect the decision?", "chain_ids": ["cloud-chain-auto-secondary-015-43"], "chain_positions": {"cloud-chain-auto-secondary-015-43": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2507", "title": 
"Model Compression Carbon ROI", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "When does the INT8 distillation project break even on carbon versus continuing FP16 inference for 5M daily requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2508", "title": "Cloud vs Edge FL Energy Tradeoff", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is moving personalization training from 500 GPU-hours to FL across 500,000 smartphones greener, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2509", "title": "Rack-Level Thermal Colocation", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule memory-bound inference and compute-bound MatMul workloads to maximize 15kW rack utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2510", "title": "Cloud Through Silicon Vias L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can't traditional wire-bonding between stacked DRAM dies achieve HBM3's bandwidth density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2511", "title": "HBM3 vs DDR5 Bandwidth Comparison", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many DDR5 channels would be required to match the total memory bandwidth of the HBM3 design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2512", "title": "Cloud Through Silicon Vias L5 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you evaluate the cost-performance trade-off between the 6-stack and 4-stack HBM3 accelerators for serving a 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2513", "title": "Cloud Top Of Rack Switch L2 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do top-of-rack switches exist, and what failure-domain implications do they create for distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2514", "title": "Cloud Top Of Rack Switch L3 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum cross-rack AllReduce bandwidth per GPU with 64 GPUs sharing a 400Gbps ToR uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2515", "title": "Cloud Top Of Rack Switch L5 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After losing one 64-GPU rack, should you restart on 960 GPUs or wait 30 minutes for repair, and what is the break-even time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2516", "title": "TorchDynamo Bytecode-Level Capture vs. Operator-Level Tracing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does TorchDynamo's Python-bytecode-level capture matter, and how does it handle dynamic control flow?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 0}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2517", "title": "TorchDynamo Speedup and Graph Break Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the speedup achieved, and estimate the potential speedup if graph breaks were eliminated so the entire model could be compiled.", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 1}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2518", "title": "Cloud Torchdynamo L5 0", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you spend 2 weeks eliminating graph breaks or buy 30% more GPUs for the 14-day, 64-A100 training run?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 3}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2519", "title": "Cloud Train Serve Split L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the train-serve split architecture, and why does it work for deploying a cloud-trained 7B model on edge devices?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": 
{"cloud-chain-auto-001-01": 0}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2520", "title": "Cloud Train Serve Split L3 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the estimated FP16 and INT8 inference memory footprints and throughputs for the 3B model?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": {"cloud-chain-auto-001-01": 1}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2521", "title": "Cloud vs. Edge Inference Deployment Trade-offs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should you choose train-cloud/serve-edge over cloud-only inference despite the added deployment complexity?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": {"cloud-chain-auto-001-01": 2}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2522", "title": "Cloud Transformation Lineage L2 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does transformation lineage prevent undocumented training-data pipeline changes from causing hard-to-debug regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2523", "title": "Cloud Transformation Lineage L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage and latency overhead would lineage metadata add to an 8-step pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2524", "title": "Cloud Transformation Lineage L5 0", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use dataset-level or record-level lineage for 100M records across 8 steps, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2525", "title": "Cloud Transient Failures L2 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What are transient failures, and why do they become routine in a 512-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2526", "title": "Cloud Transient Failures L3 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many transient failures should you expect over 30 days on 1024 GPUs, and what is the chance of zero failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2527", "title": "Cloud Transient Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare redundant computation, gradient monitoring with rollback, and validation checks for transient-failure handling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2528", "title": "Cloud Transient Loads L2 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are transient power loads, and why is a 2MW, 500ms AllReduce spike dangerous for the datacenter?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 0}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2529", "title": "Cloud Transient Loads L3 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What transient power swing and energy buffer are needed when 4,096 H100s jump from 40% to 95% utilization?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 1}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2530", "title": "Cloud Transient Loads L5 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which transient-power mitigation—buffering, staggered AllReduce, or 80% power caps—would you choose, and why?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 2}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2531", "title": "Cloud Ultra Ethernet L2 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What makes Ultra Ethernet different from standard RoCE, and why does that matter for AI collective workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2532", "title": "Cloud Ultra Ethernet L3 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What effective bandwidth should you expect for 32 AllReduce flows over 4 paths with ECMP versus packet spraying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2533", "title": "Cloud Ultra Ethernet L5 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 2048-GPU, 60-day 175B GPT training run, would you choose InfiniBand NDR or Ultra Ethernet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2534", "title": "Cloud Vector Operations L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are GPUs faster than CPUs for neural network inference despite lower clock speeds, and what role do SIMD vector operations play?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2535", "title": "Cloud Vector Operations L3 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long would ReLU on a 10M-element tensor take on an AVX-512 CPU (3 GHz, 16-wide) vs. 
an H100 GPU (3.35 TB/s), and is it compute- or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2536", "title": "Diagnosing Branch Divergence in Custom Activation Kernels", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the custom if-else activation kernel 5x slower than ReLU on GPU, and what fixes would you evaluate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2537", "title": "Cloud Wafer Scale Engine L2 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a wafer-scale chip differ from a multi-GPU setup, and what communication advantage does it provide?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 0}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2538", "title": "Cloud Wafer Scale Engine L3 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does a 13B FP16 model fit in WSE-2's 40 GB SRAM, and how much faster is SRAM-speed access than HBM for decode?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 1}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2539", "title": "Cloud Wafer Scale Engine L5 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 10M daily 200-token requests on a 13B model, should the startup choose the GPUs or the WSE, and why?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 3}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2540", "title": "Cloud Warm Restart L2 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is warm restart preferable to cold-restarting the 256-GPU job after GPU #47 fails, and how does it work?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 0}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2541", "title": "Cloud Warm vs Cold Restart Downtime Analysis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", 
"track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Over 14 days, how much downtime should the 1024-GPU job expect with cold restarts versus warm restarts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2542", "title": "Cloud Warm Restart L5 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 2048-GPU cluster, is maintaining 16 hot spares (0.8% overhead) worth the cost compared with cold restarts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2543", "title": "Cloud Warp Divergence L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is warp divergence, and why would routing tokens to different MoE experts within one warp cause a 3x slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2544", "title": "Cloud Warp Divergence L3 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 25% short-path tokens and 75% long-path tokens randomly assigned to 32-thread warps, what is the expected warp time and efficiency loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2545", "title": "Cloud Warp Divergence L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate sorting tokens, block-level branching, and padding as fixes for warp divergence in the sparse attention kernel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2546", "title": "Cloud Weight Stationary L2 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is weight-stationary dataflow, when is it advantageous, and how does it differ from output-stationary dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2547", "title": "Weight-Stationary Accelerator Memory Bandwidth for FP16 Matrix", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 2048x2048 FP16 matrix on 256 PEs with 2 KB SRAM each, how many weight tiles are needed and what is the weight traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2548", "title": "Cloud Weight Stationary L5 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which dataflow (weight-stationary or output-stationary) would you choose for 70B LLM batch-1 decode versus ResNet-50 batch-256 inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2549", "title": "Optimizing Checkpoint Frequency with the Young-Daly Formula", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why checkpoint every 30 minutes instead of every 5 minutes, and what trade-off does the Young-Daly formula optimize?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 0}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2550", "title": "Optimal Checkpoint Interval using the Young-Daly Formula", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using Young-Daly with 8-hour MTBF and 3-minute checkpoints, what is the optimal interval and expected training efficiency?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 1}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2551", "title": "Young-Daly Formula Optimization for Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which investment recovers more training efficiency, reducing checkpoint time to 1 minute for $50K, or doubling MTBF to 8 hours for $100K?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 2}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2552", "title": "Cloud Zero Copy Serialization L2 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would zero-copy serialization reduce the 8 ms JSON serialization cost and improve the 15 ms P99 latency?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 0}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2554", "title": "Cloud Zero Copy Serialization", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", 
"zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a latency-sensitive ML serving API, how would you choose among FlatBuffers, Protobuf, and JSON?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2555", "title": "Cloud Zero Optimizations L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does ZeRO reduce the memory waste of standard data parallelism, and what do Stages 1, 2, and 3 shard?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 0}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2556", "title": "Cloud Zero Optimizations L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 30B AdamW model on 8 GPUs, what is per-GPU memory under ZeRO Stages 1, 2, and 3, and which stage is the minimum needed to fit?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 1}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2557", "title": "ZeRO-2 vs ZeRO-3 Memory and Communication Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Given ZeRO-3 adds 50% communication but may enable 2x larger micro-batches, is it worth switching from ZeRO-2?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 2}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2558", "title": "Cloud Fully Sharded Data Parallel L2 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does FSDP differ from PyTorch DataParallel and DDP, and why does it enable training larger models?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 0}, "chain_tiers": {"cloud-chain-auto-013-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2560", "title": "Cloud Fully Sharded Data Parallel L5 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 30B model on 64 GPUs, which strategy has the best communication efficiency: full FSDP, node-local FSDP plus DDP, or TP plus FSDP?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 3}, "chain_tiers": {"cloud-chain-auto-013-09": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2561", "title": "Cloud Learning Rate Scheduling L2 0", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is learning rate scheduling necessary when a fixed 1e-3 LR causes LLM loss to plateau after 20% of training?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2562", "title": "Cloud Learning Rate Scheduling L3 0", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the linear scaling rule, what peak learning rate and warmup steps should you use if the original used LR=3e-4 with 1000 warmup steps?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2563", "title": "Cloud Learning Rate Scheduling L5 0", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 70% through a 30-day LLM pretraining run with flattened loss, should you restart, apply an LR warm restart, or continue?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2564", "title": "Cloud Gpu Virtualization L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How can GPU virtualization improve utilization when 20 models each use less than 10% of a modern GPU?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 0}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2565", "title": "Cloud Gpu Virtualization L3 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What MIG partition scheme should serve a 14 GB 7B model and a 2 GB 1B model on one A100?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 1}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2566", "title": "Cloud Gpu Virtualization L5 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For multi-tenant GPU sharing with workloads from 5% to 60% utilization, how do MIG, MPS, and Kubernetes time-slicing compare?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 3}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2567", "title": "Cloud Tokenization L2 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does the Chinese text tokenize into 350 tokens versus 100 for English, and how does that specifically affect memory and compute costs?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 0}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2568", "title": "Cloud Tokenization L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If a 7B LLM expands from a 32K to 128K vocabulary and cuts token count by 40%, what is the memory trade-off?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 1}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2569", "title": "Cloud Tokenization L5 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 50-language LLM, what are the systems trade-offs between one 256K BPE vocabulary and language-specific 32K tokenizers?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 2}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2570", "title": "Cloud Red Teaming L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does red teaming differ from standard model evaluation, and why is it necessary as a systems concern for a chatbot?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 0}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2571", "title": "Cloud Red Teaming L3 0", "topic": "model-serving-infrastructure", "competency_area": 
"deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long and how much would it cost to run 10,000 red-team prompts generating 50 tokens each at 40 tok/s on one GPU?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 1}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2572", "title": "Cloud Red Teaming L5 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a model serving 5M requests/day, how do human, automated LLM-based, and hybrid red teaming compare in coverage and cost?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 2}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2573", "title": "Cloud Token Budget L2 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the memory, compute, and cost implications of increasing the LLM API token budget from 4K to 32K tokens?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 0}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2574", "title": "Cloud Token Budget L3 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many concurrent requests can the H100 serve at 4K, 16K, and 64K token budgets?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 1}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2575", "title": "Cloud Token Budget L5 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use one shared GPU pool or separate pools for the 1K-token and 32K-token requests, and why?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 2}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2576", "title": "Cloud Deepspeed Zero L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are ZeRO-1, ZeRO-2, and ZeRO-3, and why might ZeRO-3 be overkill for a 7B model on 4 GPUs?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2577", "title": "Cloud Deepspeed Zero L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the per-GPU memory footprint for ZeRO-1, ZeRO-2, and ZeRO-3, and which stage fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2578", "title": "Cloud Deepspeed Zero L5 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is more communication-efficient for the 30B model: flat ZeRO-3 across 64 GPUs or hierarchical ZeRO, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2579", "title": "Cloud Megatron Parallelism L2 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does Megatron tensor parallelism split adjacent weight matrices, and why does the column-row pattern reduce communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2580", "title": "Cloud Megatron Parallelism L3 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the all-reduce communication volume and NVLink transfer time added by TP=8 per training step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2581", "title": "Cloud Megatron Parallelism L5 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which parallelism layout yields better throughput for the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2582", "title": "Cloud Qlora L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does QLoRA let a 65B model fit and fine-tune on a single 48 GB GPU?", "chain_ids": ["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 0}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2583", "title": "Cloud Qlora L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total QLoRA memory footprint for the 70B model, including adapters, gradients, optimizer states, and activations?", "chain_ids": 
["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 1}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2584", "title": "Cloud Qlora L5 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 70B fine-tuning option is best across quality, cost, and speed: QLoRA, full fine-tuning, or unquantized LoRA?", "chain_ids": ["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 2}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2585", "title": "Cloud Rlhf Infrastructure L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can PPO-style RLHF require up to 4x the GPU memory of standard fine-tuning for a 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2586", "title": "Cloud Rlhf Infrastructure L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory does PPO RLHF for the 13B model require, and how many 80 GB GPUs are minimally needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2587", "title": "Cloud Rlhf Infrastructure L5 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 70B RLHF, which setup should you choose among colocated PPO, separate generation/training clusters, and DPO, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2588", "title": "Cloud Inference Accelerator Selection L2 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What technical factors must you critically evaluate before adopting an ASIC claiming a 10x cost reduction over standard GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2589", "title": "Cloud Inference Accelerator Selection L3 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the cost per 1K tokens on A100 versus Inferentia2 at batch 1 and batch 64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2590", "title": "Cloud Inference Accelerator Selection L5 0", 
"topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate all-GPU, heterogeneous, and ASIC-first deployments for five diverse model architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2591", "title": "Cloud Benchmark Harness L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What must a proper ML benchmark harness control to avoid 20% run-to-run variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2592", "title": "Cloud Benchmark Harness", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bias does including the 5 warmup iterations introduce, and what latency should the benchmark report instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2593", "title": "Cloud Benchmark Harness L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the vendor speedup at batch 1 and sequence 128 valid for your batch 32, sequence 2048 workload, and how does compute bandwidth factor into the evaluation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2594", "title": "Cloud Benchmark Run Rules L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do MLPerf run rules matter, and why is a custom learning rate schedule outside the rules a problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2595", "title": "Cloud Benchmark Run Rules L3 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Who wins the MLPerf ResNet-50 benchmark, Team A or Team B, and what is the cost difference if both were valid?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2596", "title": "Cloud Benchmark Run Rules L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you decide between a $500K MLPerf submission and publishing internal benchmarks for your different workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2598", "title": "Cloud Information Entropy L3 0", "topic": 
"data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical minimum bits per token for lossless compression (the Shannon entropy)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2599", "title": "Cloud Information Entropy L5 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you train on the deduplicated 7 TB or the full 10 TB, given 30% deduplication and 15% higher entropy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2600", "title": "Cloud Data Quality As Code L2 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would data quality as code have caught the 15% label corruption before the 3-day training run started?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2601", "title": "Cloud Data Quality As Code L3 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What time and compute overhead should you expect for schema, null, distribution, and referential integrity checks on 500 GB daily?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2602", "title": "Cloud Data Quality As Code L5 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data quality policy is best for a daily production training pipeline, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2604", "title": "Cloud Label Consensus L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the costs and expected quality improvements of adding 2 annotators per image versus expert-reviewing the 30% disagreements?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2607", "title": "AI-Assisted 
Labeling Throughput Gain", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much labeling cost does AI pre-annotation save on 500K images at 40 versus 120 images per hour and $25/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2608", "title": "Labeling Strategy Selection Under Budget Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy maximizes labeled data volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2609", "title": "Arithmetic Intensity on the Energy Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does arithmetic intensity determine whether a workload's energy is dominated by FLOPs or data movement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2610", "title": "Computing Energy per Inference from Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total energy and data-movement fraction for 2M FLOPs and 64 KB of HBM traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2612", "title": "Federated Learning Communication Cost", "topic": "federated-learning", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much bandwidth is required per federated round and over 500 rounds for 100 devices training a 50M-parameter FP32 model?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 1}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2613", "title": "Federated vs Centralized Training Decision", "topic": "federated-learning", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach yields better model quality and why, given that the data is not privacy-sensitive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2614", "title": "TCO Drivers in Federated vs Centralized Training", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the key TCO differences between federated learning across 500 edge nodes and equivalent centralized 
cloud training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2615", "title": "Federated Learning Break-Even Analysis", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At what data volume does the federated approach become cheaper than centralized training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2616", "title": "Federated Learning ROI Under Regulatory Constraints", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which has better 2-year ROI?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 3}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2617", "title": "Gradient Inversion Attacks in Federated Learning", "topic": "federated-learning", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is 'raw data never leaves the device' insufficient for privacy, and what attack vector remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2619", "title": "Privacy-Accuracy Trade-off in Federated Deployment", "topic": "federated-learning", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which configuration should ship?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2620", "title": "Training Memory Footprint Breakdown", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does training require 4-8x more memory than inference, and what are the major memory consumers in a training step?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 0}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2621", "title": "Will This Model Fit in GPU Memory?", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much parameter-related memory is needed, and what is the minimum number of A100-80GB GPUs required ignoring activations?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 1}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2622", "title": "Memory Reduction Strategy Selection", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which memory reduction approach should be used for 7B fine-tuning on a single 80GB GPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2623", "title": "Why Micro-Benchmarks Mislead at System Level", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can a GEMM micro-benchmark hit near-peak TFLOPS while end-to-end training only achieves 45% MFU?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 0}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2624", "title": "Isolating Memory Bandwidth via Micro-Benchmark", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 80GB total data transfer in 42ms, what bandwidth is achieved and how does it compare to the hardware's 2.0 TB/s peak?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 1}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2625", "title": "Diagnosing Performance with Micro vs Macro Benchmarks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you blame the hardware or the software stack, and what micro-benchmarks would you run to diagnose?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 2}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2626", "title": "Why Time-to-Train Uses Target Accuracy Not Epochs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does MLPerf Training use time to reach target accuracy rather than time per epoch or time for N steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2627", "title": "Estimating Time-to-Train from Hardware Specs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 64 A100s at 45% MFU, how long will 3.2e18 FLOPs take to train?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2628", "title": "Evaluating Two Clusters for Training Competition", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which cluster wins on time-to-train, and by what factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2629", "title": "Why Raw Agreement Percentage Misleads", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is 90% raw labeler agreement insufficient on its own, and what metric should be used to prove true inter-rater reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2631", "title": "Handling Low Agreement in Production Labeling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is optimal: relabeling, label smoothing, or reducing the number of classes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2632", "title": "Why nn.Module Uses Parameter Registration", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does PyTorch's nn.Module require explicit nn.Parameter registration instead of treating all tensors as learnable?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 0}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2633", "title": "Counting Parameters in a Module Hierarchy", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total parameters does this Embedding, 6-block Transformer, and final Linear model have including biases?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 1}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2634", "title": "Module Design for Serialization Robustness", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What breaks after Team B refactors the module class, and what serialization pattern should they use instead?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 2}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2635", "title": "Why model.eval() Matters for Inference", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why do predictions vary between identical inputs when a deployed model is left in training mode, and which layers cause it?", "chain_ids": ["cloud-chain-auto-005-17"], "chain_positions": {"cloud-chain-auto-005-17": 0}, "chain_tiers": {"cloud-chain-auto-005-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2636", "title": "BatchNorm Statistics Divergence", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What value does BatchNorm use for normalization in train mode vs eval mode?", "chain_ids": ["cloud-chain-auto-005-17"], "chain_positions": {"cloud-chain-auto-005-17": 1}, "chain_tiers": {"cloud-chain-auto-005-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2637", "title": "Diagnosing Train-Eval Performance Gap", "topic": "data-efficiency-selection", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With `model.eval()` correctly called, what is the likely cause of the 92% training to 78% deployment accuracy drop and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2638", "title": "Pure Functions Enable Composable Transforms", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does JAX require pure functions, and how does that enable transformations like vmap, jit, and grad?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 0}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2639", "title": "Vectorization Speedup via vmap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If vmap(grad(loss_fn)) vectorizes 256 per-sample gradients and the GPU has enough parallelism, what speedup and time should you expect?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 1}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2640", "title": "Choosing Between Stateful and Functional Paradigms", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 
team switch to JAX for per-sample gradients, higher-order derivatives, and auto-vectorization, and what are the trade-offs?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 2}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2641", "title": "The Ridge Point and Hardware Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the A100 and H100 hardware balance points, and what do they imply for workload optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2642", "title": "Determining Bottleneck from Hardware Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is this compute-bound or memory-bound on an accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2644", "title": "The ML Test Score Framework", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does the ML Test Score measure, and why is it designed to assess technical debt rather than model accuracy?", "chain_ids": ["cloud-chain-auto-011-10"], "chain_positions": {"cloud-chain-auto-011-10": 0}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2645", "title": "Scoring a Production ML System", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the team's ML Test Score, and how long will it take to reach a production-ready score of 20?", "chain_ids": ["cloud-chain-auto-011-10"], "chain_positions": {"cloud-chain-auto-011-10": 1}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2646", "title": "Prioritizing ML Test Score Improvements", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which investment maximizes operational reliability and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2649", "title": "When Level 2 MLOps Is Premature", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the startup invest 6 months building Level 2 infrastructure?", "chain_ids": ["cloud-chain-auto-011-10"], 
"chain_positions": {"cloud-chain-auto-011-10": 2}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2650", "title": "How Pipeline Jungles Form", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does an ML data pipeline evolve into a pipeline jungle, and why is that more dangerous than regular code complexity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2652", "title": "Pipeline Jungle Remediation Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which remediation approach minimizes production risk while successfully detangling the rigid dependencies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2653", "title": "Why Glue Code Dominates ML Systems", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is only about 5% of many ML systems actual model code, and what makes up the remaining glue code?", "chain_ids": ["cloud-chain-auto-001-09"], "chain_positions": {"cloud-chain-auto-001-09": 0}, "chain_tiers": {"cloud-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2654", "title": "Quantifying Glue Code Maintenance Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many monthly maintenance hours come from the 500 lines of model code and 8,000 lines of glue code?", "chain_ids": ["cloud-chain-auto-001-09"], "chain_positions": {"cloud-chain-auto-001-09": 1}, "chain_tiers": {"cloud-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2655", "title": "Build Custom vs Use Off-the-Shelf ML Package", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should they do it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2656", "title": "How Undeclared Data Dependencies Cause Silent Failures", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can changing user_engagement_score from a 30-day to a 7-day rolling average cause a silent model failure?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2657", "title": "Detecting Undeclared Dependencies via Feature Statistics", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does this 0.04 shift trigger a 2-sigma alert, and how should you adjust for monitoring 50 features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2658", "title": "Preventing Undeclared Dependencies at Scale", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy best prevents silent failures at scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2659", "title": "Shadow Mode vs A/B Testing vs Canary Releases", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the three main online evaluation strategies (Shadow, Canary, A/B), when is each appropriate, and what risk does each mitigate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2661", "title": "Choosing Evaluation Strategy for a Safety-Critical Model", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which online evaluation strategy should you use for this safety-critical model, given the extreme cost asymmetry between false negatives and false positives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2662", "title": "How Request Pipelining Hides Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For the 5ms/10ms/3ms stages, what throughput, latency, and GPU utilization do you get without pipelining versus with 3-stage pipelining?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 0}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2663", "title": "Pipeline Throughput with Unbalanced Stages", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 100 concurrent requests in the 2ms/3ms/15ms/1ms pipeline, what is the steady-state throughput and bottleneck?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 1}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2664", "title": "Pipelining vs Batching Trade-off", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which serving optimization approach should be chosen to meet the p99 SLA while maximizing throughput headroom?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 2}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2665", "title": "Why Tail Latency Matters at Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the effective p99 latency the user experiences?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 0}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2666", "title": "Hedged Request Latency Improvement", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming independent latency distributions, what is the approximate new p99 latency and extra backend load?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 1}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2667", "title": "Tail Latency Mitigation Strategy Selection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which approach provides the most cost-effective and operationally sound tail latency mitigation strategy?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 2}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2668", "title": "Model Caching in Multi-Model Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is model caching needed when 200 models share VRAM that holds only 20, and how does it differ from traditional web caching?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 0}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2669", "title": "Model Cache Hit Rate and Latency Impact", "topic": "model-serving-infrastructure", "competency_area": "deployment", 
"track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 80GB VRAM and an LRU strategy proportional to traffic, how many models of each size fit, and what hit rate and average latency result?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 1}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2670", "title": "Model Caching Strategy Under Cost Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture minimizes cost while maintaining a strict p99 < 200ms SLA?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 2}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2671", "title": "Why Transformers Need Positional Encoding", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does a Transformer without positional encoding treat word-order permutations as equivalent, and why is this architecturally inevitable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2672", "title": "Positional Encoding Memory Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory cost of learned positional embeddings for 8192 positions and 4096 FP16 dimensions, and how does it compare to a 7B parameter model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2673", "title": "Choosing a Positional Encoding for Long-Context LLM", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which positional encoding approach (Absolute, RoPE+NTK, or ALiBi) should you choose for a 128K context LLM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2674", "title": "Receptive Field Growth in Deep CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a CNN's receptive field grow with depth, and why does that matter for detecting objects of different sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2677", "title": "Compute Cost of Adding Network Depth", "topic": "transformer-systems-cost", "competency_area": 
"architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much do inference time and parameter count increase when adding 5 identical 256-filter 3×3 layers to the 10-layer CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2678", "title": "Depth vs Width for Limited Compute Budget", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture achieves better accuracy under the 1 GFLOP constraint, and what structural components are necessary for it to train successfully?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2679", "title": "How Recommendation Systems Create Feedback Loops", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How can monthly retraining a recommender on its own engagement data create a runaway feedback loop, and why is it hard to detect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2680", "title": "Quantifying Feedback Loop Amplification", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After 3 retraining cycles with $\\pm$ 20% observation bias, what is the predicted crime amplification for the top-10 neighborhoods?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2681", "title": "Breaking a Feedback Loop in Production", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach should you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2682", "title": "Graceful Degradation in ML Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why do production ML systems need fallback strategies, and what is the typical fallback hierarchy from most to least sophisticated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2683", "title": "Fallback Availability Impact on SLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 99.5%, 99.9%, and 99.99% availability layers, what is overall system availability assuming independent failures?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2684", "title": "Why Backward Pass is 2x Forward", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does the backward pass require about twice the FLOPs of the forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2685", "title": "Estimating Total Training FLOPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using 6ND, what are the total training FLOPs for a 7B model on 1T tokens and the training time on 256 A100s at 50% MFU?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 1}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2686", "title": "Training Compute Budget Allocation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option produces the best model according to Chinchilla scaling laws, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2687", "title": "Physical Limits on Training Cluster Scale", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why can't you keep halving training time by doubling GPUs, and what three physical ceilings limit cluster scaling?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 0}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2688", "title": "Calculating Communication-Compute Overlap Ceiling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the AllReduce time for the 20GB gradient across 1024 GPUs over 200 Gb/s InfiniBand, and what training efficiency results?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 0}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2689", "title": "Scaling Strategy Under Physical Constraints", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which scaling strategy maximizes time-to-train reduction under the physical facility constraints?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": 
{"cloud-chain-auto-005-08": 3}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2690", "title": "Recursive Halving-Doubling vs Ring AllReduce", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does recursive halving-doubling achieve AllReduce in O(log N) steps versus ring AllReduce's O(N), and when is each preferred?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 1}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2692", "title": "AllReduce Algorithm Selection for Mixed Workloads", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use different algorithms for each?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 4}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2693", "title": "Why Standard Optimizers Fail with Compressed Gradients", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does naive gradient compression such as top-k sparsification or 1-bit quantization degrade convergence with Adam?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2694", "title": "Communication Savings from 1-Bit Adam", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 1-bit Adam on a 10B model across 64 GPUs, what communication reduction and AllReduce time at 200 Gb/s do you get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2695", "title": "Compression Strategy for Bandwidth-Constrained Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach best balances convergence and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2696", "title": "Why GPUDirect Storage Eliminates the CPU Bounce Buffer", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does GPUDirect Storage bypass CPU RAM, and why does that improve training data throughput?", "chain_ids": ["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 0}, 
"chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2697", "title": "Data Loading Bottleneck Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the epoch times with and without GPUDirect Storage, and what is the true hardware bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2698", "title": "Storage Architecture for Large-Scale Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which storage architecture provides the best throughput for large-scale training, and why do the others fail?", "chain_ids": ["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2699", "title": "SSP Staleness Bound Intuition", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What does Stale Synchronous Parallel (SSP) actually guarantee, and why does its staleness bound matter for convergence?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 0}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2700", "title": "SSP Throughput vs BSP", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With BSP at 180 ms due to persistent stragglers, what throughput improvement should SSP with S=5 provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2701", "title": "Synchronous FSDP Straggler Mitigation at 256 GPUs", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 256-GPU synchronous FSDP training of a 7B LLM, which mitigation strategy (pure sync, sync + backup workers, or bounded-staleness async) preserves convergence with the smallest throughput cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2702", "title": "Linear Scaling Rule Intuition", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": 
"recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does scaling from 8 to 64 GPUs with an 8x larger global batch require increasing the learning rate?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 0}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2703", "title": "LR Warmup Duration Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the target learning rate and the number of warmup steps?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 1}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2704", "title": "When Linear Scaling Breaks Down", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For ResNet-50 at batch size 65,536 diverging after warmup, should you use sqrt LR scaling, LARS, or reduce batch size?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 2}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2705", "title": "Critical Batch Size Concept", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is critical batch size, and why does doubling GPUs not guarantee a 2x training speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2706", "title": "Compute Efficiency Beyond B_crit", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At B_crit=4096, what time savings and compute efficiency should you expect when scaling to B=16384 on 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2707", "title": "Budget Allocation Near B_crit", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With a $500K budget and B_crit near 8K, should you choose 128 GPUs at B=32K for 2 weeks or 32 GPUs at B=8K for 6 weeks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2708", "title": "Parameter Server vs AllReduce", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": 
"training", "question": "What is the key architectural difference between parameter servers and AllReduce, and why do modern frameworks favor AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2709", "title": "PS Bandwidth Bottleneck", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 32 workers, 4 parameter servers, 100 Gbps Ethernet, and 2GB gradients per worker, what is the minimum push time and bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2710", "title": "PS for Sparse Recommendation Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a recommendation model with a 500GB embedding table and under 1% access per batch, should you use AllReduce sharding or parameter servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2711", "title": "Data Parallelism Around an Expert-Parallel MoE Block", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "When DP wraps an already expert-parallel 64-GPU MoE block, what does the DP layer actually replicate, and what AllReduce cost does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2713", "title": "Expert Load Imbalance Diagnosis", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 64-expert top-2 MoE with 40% MFU and highly skewed expert loads, should you add load loss, raise capacity, or use hash routing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2714", "title": "Bathtub Curve Phases", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "When should you expect the highest failure rates, and what are the three phases of the bathtub curve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2715", "title": "Burn-in Cost vs Failure Cost", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should you burn-in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2716", "title": "Fleet Refresh Timing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 3-year-old 2,000-GPU cluster entering wear-out, should you replace the fleet now, replace failures reactively, or roll replacements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2717", "title": "MTBF vs Cluster MTBF", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "With a 100,000-hour per-GPU MTBF and 1,000 GPUs, why is cluster-level MTBF dramatically lower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2718", "title": "Optimal Checkpoint Interval from MTBF", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using the Young/Daly formula, what is the optimal checkpoint frequency for 512 GPUs with a 50,000-hour per-GPU MTBF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2719", "title": "MTBF-Aware Cluster Sizing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For training a 70B model on 1024 GPUs for 14 days or 512 GPUs for 28 days, which loses less time to failures and checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2720", "title": "Permanent vs Transient Faults", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What type of fault is a GPU matmul error that persists after process restart but disappears on another GPU, and why does it matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2721", "title": "Permanent Fault Impact on Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After 72 hours of a 256-GPU run with 6-hour checkpoints, how long does recovery take from a permanent GPU fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2722", "title": "Silent Data Corruption Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After 18 hours of silent data corruption from a permanent GPU fault, should you roll back, continue, or add online gradient checksumming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2723", "title": "Why Intermittent Faults Are Hardest", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why are sporadic crashes on GPU", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2725", "title": "Intermittent Fault Policy Design", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 20 GPUs with intermittent faults in a 2,000-GPU cluster, should you replace all, auto-quarantine after three strikes, or add hot spares?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2726", "title": "What Causes a Checkpoint Storm", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is a checkpoint storm, and why does checkpointing on 1,024 GPUs take 30 minutes instead of 5?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 1}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2727", "title": "Checkpoint Storm I/O Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 512-GPU FSDP checkpoint with 280MB shards and 200 GB/s Lustre, what write time do you expect with and without staggered writes?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 2}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2728", "title": "Checkpoint Storm Mitigation Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 50 jobs causing 15-minute filesystem brownouts during checkpoints, should you stagger schedules, use local NVMe plus async copy, or add storage?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 4}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2729", "title": "Five Levels of ML Networking", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the five-level networking model, and why does each level matter for ML cluster workloads?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2730", "title": "Bisection Bandwidth at Each Level", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 3-tier fat-tree with 400 Gbps links, 16 leaves, and 256 nodes, what are the bisection bandwidth and per-node AllReduce bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2731", "title": "Networking Level Mismatch Diagnosis", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "In a full-bisection 400G RDMA fat-tree with AllReduce at 40% of theoretical, where might the bottleneck be and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2732", "title": "Link Budget Fundamentals", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is a link budget, and why does it determine whether 400G can run over 5 meters of passive copper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2735", "title": "Failure Domain Concept", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is a failure domain, and why does a ToR switch taking down 16 GPU nodes matter for ML job placement?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 0}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2736", "title": "Failure Domain Probability", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the probability of at least one ToR failure during a 30-day 128-GPU training run.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2737", "title": "Failure Domain vs Locality Trade-off", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 64-GPU job, should you place all 8 nodes in one rack for 30% faster AllReduce or spread them across 4 racks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2738", "title": "ML Technical Debt Sources", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": 
"recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is ML technical debt, and what sources make it different from traditional software technical debt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2739", "title": "Technical Debt Maintenance Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the refactoring break-even point and annual savings, and should the team prioritize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2741", "title": "Capacity Planning Fundamentals", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is doubling the GPU budget not enough when model count grows from 50 to 100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2742", "title": "Training vs Serving GPU Split", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimum GPU allocation is required for training and serving at a 70% utilization target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2743", "title": "On-Prem vs Cloud Capacity Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which of the on-prem (300 GPUs), hybrid burst (100 on-prem + 200 cloud), or all-cloud options minimizes 3-year cost, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2744", "title": "Continuous Training Triggers", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What should trigger continuous training for the declining CTR model, and how would it reduce staleness?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 0}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2745", "title": "Continuous Training Cost Model", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the weekly GPU cost change, and does continuous training save money once monitoring, data pipeline, and validation overhead are included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2746", "title": "Continuous Training Safety", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate shadow testing, human approval, and automated canaries for this 3x/week auto-deploy pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2747", "title": "Validation Gate Purpose", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why should the new model pass automated validation gates beyond a 2% aggregate accuracy gain before deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2748", "title": "Validation Gate Latency Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the weekly validation time sequentially, and how would you parallelize the 5 gates to cut wall time by at least half?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2750", "title": "Monitoring Hierarchy Layers", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would hierarchical monitoring help triage the 15% revenue drop when system metrics are green?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2752", "title": "Monitoring Architecture for Multi-Model Fleet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design monitoring for 500 models, and which of per-model dashboards, anomaly detection, or SLOs should be primary?", "chain_ids": ["cloud-chain-auto-001-13"], "chain_positions": {"cloud-chain-auto-001-13": 2}, "chain_tiers": {"cloud-chain-auto-001-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2753", "title": "Alert Fatigue in ML Systems", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is alert fatigue, and why did 200+ daily alerts make the 12-hour missed ML degradation likely?", "chain_ids": ["cloud-chain-auto-001-13"], "chain_positions": {"cloud-chain-auto-001-13": 0}, "chain_tiers": {"cloud-chain-auto-001-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2756", "title": "NAS Search Space and Cost", "topic": 
"neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does NAS search-space design matter more than the search algorithm, and why is NAS so computationally expensive?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2757", "title": "NAS Compute Budget", "topic": "neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total GPU-hours and dollar cost of this weight-sharing NAS run at $3 per GPU-hour?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2758", "title": "NAS vs Manual Design", "topic": "neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With 2 weeks, 1000 GPU-hours, and a 200ms edge latency limit, which model-selection option would you choose and why?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2759", "title": "Feedback Loop Formation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How did the recommender's training feedback loop form, and why does it produce filter bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2761", "title": "Feedback Loop Mitigation Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate exploration, causal debiasing, and demographic parity for mitigating the lending feedback loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2762", "title": "Guardrail Architecture Layers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What guardrails would prevent medical advice, pricing leaks, and off-topic replies, and where do they sit in the pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2763", "title": "Guardrail Latency Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Do the input/output classifiers and regex filter fit within the 2-second chatbot latency budget, and how can you optimize them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2765", "title": "Why OOD Detection Matters", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is OOD detection needed for nighttime infrared images, and why is high softmax confidence insufficient?", "chain_ids": ["cloud-chain-auto-secondary-017-10"], "chain_positions": {"cloud-chain-auto-secondary-017-10": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2767", "title": "OOD Detection Strategy Selection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use temperature scaling, deep ensembles, or feature-density estimation for safety-critical OOD detection, and why?", "chain_ids": ["cloud-chain-auto-secondary-017-10"], "chain_positions": {"cloud-chain-auto-secondary-017-10": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2770", "title": "Prompt Injection Defense Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "You must choose a defense: (A) classifier-based input filtering, (B) sandboxed tool execution with human-in-the-loop, or (C) an instruction hierarchy where tool-call requests from user content are always rejected. 
How would you evaluate classifier filtering, sandboxed tools, and instruction hierarchy for tool-use prompt injection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2771", "title": "ML Attack Surface", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is the ML model's threat model larger than a traditional software threat model, and what ML-specific attacks should it cover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2772", "title": "Threat Model Risk Matrix", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What four ML-specific threats would you include for the fraud detector, and how would you score their likelihood, impact, and risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2774", "title": "How Adversarial Evasion Works", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do adversarial stickers make a vision model misclassify a stop sign, and why can small perturbations flip predictions?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2775", "title": "Adversarial Training Compute Overhead: Adversarial Robustness & Security", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much longer and more expensive will 10-step PGD adversarial training be than the 24-hour standard run?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2776", "title": "Adversarial Defense Selection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defenses should you use against structured-feature evasion attacks in production fraud detection, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2777", "title": "System Prompt Extraction Risk", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is system prompt leakage a security risk, and why is telling the LLM not to reveal its prompt insufficient?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": {"cloud-chain-auto-004-13": 0}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2778", "title": "Injection Detection Accuracy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At 100,000 queries/day with a 0.1% attack rate, what are the classifier's daily true positives, false positives, and precision?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": {"cloud-chain-auto-004-13": 1}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2779", "title": "LLM Injection Defense Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you evaluate fine-tuning, dual-instance isolation, and classifiers for prompt injection in a RAG application?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": {"cloud-chain-auto-004-13": 2}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2780", "title": "WUE Metric Explained", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is Water Usage Effectiveness (WUE), and why is cooling AI training clusters a growing water sustainability concern?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 0}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2782", "title": "Cooling Strategy for Water-Scarce Region", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 10 MW IT-load datacenter in a water-scarce desert, which cooling option provides the best TCO and sustainability balance, and why?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 3}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2783", "title": "Embodied Carbon Concept", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": 
"What is embodied carbon, and why does extending AI hardware lifespan effectively reduce annualized emissions?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 0}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2784", "title": "Fleet Lifecycle Carbon Calculation", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 1,000 A100 fleet, what is the annualized total carbon under 3-year versus 5-year replacement cycles?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 1}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2785", "title": "Upgrade vs Extend Hardware Lifecycle", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 1,000-GPU fleet, which replacement option minimizes the 5-year total carbon footprint, and why?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 3}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2786", "title": "The Divergence Problem", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why won't Moore's Law keep AI energy consumption in check as model compute demand grows?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": {"cloud-chain-auto-secondary-009-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2787", "title": "Energy Gap Projection", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What power will an equivalent 2030 frontier training run require, and how does it compare to a 100 MW datacenter?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": {"cloud-chain-auto-secondary-009-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2788", "title": "Strategies Against Divergent Scaling", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate on-site renewables, algorithmic efficiency, and power-abundant regions for this power-scaling problem?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": 
{"cloud-chain-auto-secondary-009-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2789", "title": "Cloud New 0002", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 3D parallelism strategy would you use to train the 500B dense model to maximize hardware utilization and minimize communication bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2790", "title": "Cloud New 0003", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design asynchronous checkpointing for a 175B model on 256 H100s without stalling training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2791", "title": "Cloud New 0004", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a tiered KV-cache for a 70B LLM with 100k context on an 8x H100 node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2792", "title": "Cloud New 0005", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a hybrid AOT/JIT compiler for a 13B model with sequence lengths from 10 to 8000?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 4}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2793", "title": "Cloud New 0006", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you map expert parallelism for a 300B MoE model across 128 nodes to avoid all-to-all congestion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2794", "title": "Cost-Optimal Heterogeneous Serving for 30B LLMs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design disaggregated serving for a 30B LLM using A100s for prefill and T4s for decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2796", "title": "Cloud New 0009", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": 
"cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a decoupled H100 serving pipeline for a vision encoder feeding an autoregressive LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2798", "title": "Cloud New 0013", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compile and optimize a 7B model for real-time inference on Orin devices with 275 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2799", "title": "Privacy-Preserving Cross-Tenant Data-Parallel Aggregation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect privacy-preserving 3-region data-parallel aggregation for a 70B model on 192 H100s with secure cross-region averaging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2800", "title": "Cloud New 0015", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design serverless 7B LLM inference on idle A100s with low cold-starts using PCIe Gen4 weight offload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2801", "title": "Cloud New 0016", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect silent data corruption and hardware degradation across 10,000 GPUs without interrupting training?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 3}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2802", "title": "Sequence Parallelism for 100k Contexts on 1T Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you combine sequence parallelism with Megatron-style training to handle 100k contexts for a 1T model without hitting HBM3 limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2803", "title": "Cloud New 0019", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you make Ring All-Reduce gradient synchronization resilient to flaky 400Gbps IB links for a 50B model?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2804", "title": "Optimizing MoE Routing Kernels for Sparse Accesses on H100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you auto-tune custom MoE routing kernels on H100s to maximize HBM3 bandwidth for sparse accesses?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 3}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2805", "title": "Cloud New 0022", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 100B training across reserved H100s and volatile spot V100s to reduce cost while guaranteeing progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2806", "title": "Zero-Overhead Distributed Tracing for LLM Serving", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you trace a 70B LLM serving stack across A100 nodes to debug P99 spikes in TP syncs and KV-cache allocations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2807", "title": "Heterogeneous-Bandwidth Data-Parallel AllReduce Architecture", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect data-parallel AllReduce on 128 A100s split between an InfiniBand HDR island and an Ethernet half so synchronization fits inside a 380 ms backward pass?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 5}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2808", "title": "Cloud New 0026", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a pipeline parallel serving architecture to maximize throughput and minimize latency for a 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2809", "title": "Cloud New 0027", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a shared KV-cache for 1000s of AI NPC users who share a world-state prompt but branch into unique dialogues?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2810", "title": "Cloud New 0028", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you migrate KV-cache state for preemptible H100 LLM serving without losing sessions or causing large latency spikes?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 3}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2811", "title": "Pipeline Parallelism across Asymmetric Clusters", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you fine-tune a 65B model across 4 V100 32GB nodes and 2 A100 80GB nodes without stragglers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2812", "title": "Auto-Scaling and Routing for 7B Chat Model on Mixed T4 Fleet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you autoscale and route traffic for a 7B chat model with 10x daily swings using reserved and spot T4s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2819", "title": "NVLink vs PCIe Gen4 Bandwidth on A100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the primary use cases and bandwidths of NVLink versus PCIe Gen4 on an A100 system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2821", "title": "Cloud New 0011", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is Data Parallelism, and what happens to the model weights across GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2837", "title": "Cloud New 0031", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is Operator Fusion, and why does it improve performance for memory-bound workloads?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2838", "title": "Cloud New 0032", "topic": 
"graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is JIT compilation, and how does it differ from AOT compilation in PyTorch-style ML workloads?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 0}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2839", "title": "Cloud New 0033", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does XLA stand for, and what is its primary purpose in an ML stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2840", "title": "Cloud New 0034", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is Constant Folding, and how does it simplify a computational graph before inference?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 0}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2841", "title": "Cloud New 0035", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is auto-tuning in ML compilation, and how does it improve kernel performance?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 0}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2842", "title": "Asynchronous Checkpointing vs Synchronous Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is asynchronous checkpointing, and how does it mitigate compute stalls compared to synchronous checkpointing?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 0}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2843", "title": "Cloud New 0038", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a heartbeat mechanism, and how does a cluster orchestrator detect a dead node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2844", "title": "Cloud New 0039", "topic": 
"fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Define Elastic Training, and state its primary benefit when a node fails in the middle of a distributed job.", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 0}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2845", "title": "Pipeline Bubble in Pipeline Parallelism", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the pipeline bubble in pipeline-parallel training, and why does it occur during the initial forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2847", "title": "Operating Point on the Queueing Hockey-Stick", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using the M/M/1 queueing model, what utilization brings P99 latency under the 50 ms SLO, and why does latency spike as utilization approaches 1?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 3}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2849", "title": "Incast Network Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the 28ms latency based on the network bandwidth constraints of the aggregator?", "visual": {"kind": "svg", "path": "cloud-2849.svg", "alt": "A diagram showing multiple worker nodes simultaneously sending 200 MB of data to a single aggregator node, creating an incast bottleneck at the aggregator's 400 Gbps NIC.", "caption": "Simultaneous fan-in (incast) traffic overwhelming a single receiver link."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2850", "title": "MoE Topology Oversubscription", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the inter-node All-to-All latency for both the 1:1 and 2:1 topologies, and specify whether the 2:1 design meets a strict 50 ms communication SLO.", "visual": {"kind": "svg", "path": "cloud-2850.svg", "alt": "A two-tier leaf-spine network topology diagram showing 2 spine switches connected to 4 leaf switches, with each leaf switch connected to 2 compute nodes (8 GPUs each), illustrating a 2:1 oversubscription ratio.", "caption": "2:1 Oversubscribed Leaf-Spine Interconnect Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2851", "title": "Raw Tensor Network Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the data loading throughput to diagnose the root cause of the low utilization.", "visual": {"kind": "svg", "path": "cloud-2851.svg", "alt": "Bar chart comparing 6.25 GB/s network capacity against 32.2 GB/s required for raw FP16 images, and ~1 GB/s for JPEG.", "caption": "Throughput limits and demands for the data loading pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2852", "title": "Checkpoint Efficiency and SLA Targets", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether the current storage backend can meet the 90% efficiency SLA using optimal checkpointing, and if not, specify the minimum write bandwidth required.", "visual": {"kind": "svg", "path": "cloud-2852.svg", "alt": "Timeline diagram illustrating normal operation with checkpoints, a crash failure, the lost work interval (RPO), and the recovery time (RTO).", "caption": "Checkpointing timeline showing the tradeoff between interval frequency (T) and the lost work upon failure."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2854", "title": "HBM Cache Hit Rate", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum cache hit rate required in HBM to achieve an effective average embedding lookup bandwidth of at least 1.6 TB/s?", "visual": {"kind": "svg", "path": "cloud-2854.svg", "alt": "A bar chart comparing CPU DRAM bandwidth at 64 GB/s, effective target at 1600 GB/s, and HBM3 at 3200 GB/s on a logarithmic scale.", "caption": "Memory hierarchy bandwidth tiering comparison."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2857", "title": "M/M/1 Accelerator Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At what arrival rate will the average response time spike to exactly 40 ms?", "visual": {"kind": "svg", "path": "cloud-2857.svg", "alt": "A hockey-stick line graph of an M/M/1 queue showing average latency exponentially rising as arrival rate approaches 150 requests per second.", "caption": "Queueing theory hockey-stick curve for M/M/1 latency."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2859", "title": "Multimodal Pipeline Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the required S3 network bandwidth in GB/s and determine the 
minimum number of CPU cores required if each core decodes 200 images/second.", "visual": {"kind": "svg", "path": "cloud-2859.svg", "alt": "Bar chart comparing the throughput capacities of S3 read, CPU decode, PCIe Gen5 transfer, and GPU processing stages against a target horizontal line.", "caption": "Data Pipeline Throughput vs Bottleneck Target"}, "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 4}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2860", "title": "Cloud GPU Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycling schedule with a 1-hour buffer around the peak, and compute the daily kWh saved compared to keeping all 100 GPUs fully active 24/7.", "visual": {"kind": "svg", "path": "cloud-2860.svg", "alt": "A step plot showing the number of active GPUs increasing before a traffic peak and dropping immediately after, overlaid on a smooth traffic demand curve.", "caption": "Proactive Duty Cycling Over 24 Hours"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2862", "title": "High-Frequency Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Determine the system's Mean Time Between Failures (MTBF) and explain why standard synchronous checkpointing is unviable here.", "visual": {"kind": "svg", "path": "cloud-2862.svg", "alt": "A horizontal bar chart timeline illustrating the extreme overlap of checkpointing, compute, and recovery times when system MTBF is nearly equal to checkpoint duration.", "caption": "Synchronous Checkpointing Overhead Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2870", "title": "Gradient Sync Bucket Spec", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What measurements are needed before tuning gradient bucket size for communication-computation overlap?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 2}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2878", "title": "Estimating Activation Memory for a Transformer Layer on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much activation memory will the forward pass consume across all 72 layers, and will it fit alongside fp16 70B weights?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 2}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2879", "title": "Designing a Gradient Checkpointing Segment Strategy for a 70B Model on A100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What gradient checkpointing strategy keeps activations under 28 GB per GPU while minimizing recomputation for this fine-tune?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 1}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2880", "title": "Designing Activation Offloading vs. Checkpointing Trade-off for Long-Sequence Training on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Should you use checkpointing every 4 layers or PCIe 5.0 activation offload to keep the 32K-token step under 2 seconds, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2881", "title": "Evaluating Gradient Checkpointing Overhead on Training Throughput for GPT-Scale Models", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the 21.6% throughput loss from checkpointing acceptable for the OOMing 13B job, and does it match theoretical recompute overhead?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 2}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2882", "title": "Evaluating Activation Memory Profiler Output for Silent Memory Fragmentation", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate whether activation memory fragmentation is the cause and determine a remediation path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2883", "title": "Implementing Gradient Checkpointing with Custom Autograd for a Non-Standard Layer", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you implement fine-grained MoE checkpointing so gating logits are discarded and only routing is recomputed in backward?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2885", "title": "Optimizing FlashAttention vs. 
Standard Attention Activation Footprint at Scale", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much activation memory does FlashAttention save at S=8192, and does it eliminate the need for gradient checkpointing within an 80 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2886", "title": "Optimizing Activation Memory for Pipeline-Parallel Training Across A100 Nodes", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the first 1F1B pipeline stage use 62 GB of activations while the last stage uses so much less memory, and how would you reduce the imbalance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2887", "title": "Realizing Activation Checkpointing in a ZeRO-3 + Tensor-Parallel Hybrid Configuration", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What happens to the 22 GB/GPU activation memory when DeepSpeed activation checkpointing is enabled, and how does it interact with TP=4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2888", "title": "Realizing Selective Layer Activation Storage for Inference-Time KV Cache vs. 
Training Activations", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is inference always lower-memory than training at the same batch and sequence length, and when can KV cache exceed training activations?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 4}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2889", "title": "Specifying Activation Memory Budget for a Multi-Tenant Training Cluster on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What activation memory allocation policy should the scheduler enforce for jobs across model size, sequence length, batch, and checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2890", "title": "Specifying Gradient Checkpointing Requirements in a Model Training SLA", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpointing and sharding configuration satisfies the 90% HBM, 350 tokens/sec/GPU, and 500B-token-in-60-days constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2891", "title": "Analyze depthwise separable convolution parameter reduction on ResNet-50 baseline", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FLOP speedup would replacing 3x3 convs with depthwise-separable layers give, and is that alone enough to justify retraining?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2892", "title": "Analyze EfficientNet compound scaling vs. 
naive width/depth scaling on throughput", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which of EfficientNet-B4, widened ResNet-50, or deepened ResNet-101 has the best accuracy-per-FLOP and throughput?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 1}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2893", "title": "Analyze inverted residual bottleneck memory access patterns on A100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the MobileNetV2 expand layer the bottleneck for 112x112x32 inputs on A100, or are depthwise/project layers more bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2894", "title": "Design an efficient CNN serving pipeline for H100 multi-tenant inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a batching and memory management strategy that maximizes single-GPU utilization while keeping p99 latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2895", "title": "Design a FLOP-accuracy Pareto frontier experiment for MobileNet family selection", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the model-selection experiment to find the FLOP-accuracy Pareto frontier and serving cost for 200K QPS on 32 A100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2896", "title": "Design EfficientNetV2 training pipeline for large-scale cloud fine-tuning", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What distributed training, gradient accumulation, and memory plan would let EfficientNetV2-XL fine-tune on 50M images in under 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2897", "title": "Diagnose low GPU utilization when serving MobileNetV3 on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is MobileNetV3-Large only reaching 12% SM utilization at 5,000 QPS batch=1 on H100, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2898", "title": "Diagnose accuracy degradation after replacing standard convs with depthwise separable", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing ResNet-34 3x3 convolutions with depthwise separable convolutions hurt ImageNet accuracy, and how can it be recovered?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2899", "title": "Diagnose NaN losses during EfficientNet training with aggressive augmentation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What failure chain could make EfficientNet-B5 with BF16 and Mixup diverge at SE blocks, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2900", "title": "Evaluate MobileNetV3 vs EfficientNet-B0 accuracy-latency tradeoff for A100 batch serving", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model gives better cost per unit accuracy under a 30 ms p99 A100 TensorRT INT8 SLA: MobileNetV3-Large or EfficientNet-B0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2901", "title": "Evaluate EfficientNet compound scaling coefficient impact on H100 memory bandwidth", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the memory bandwidth utilization at each scaling level, where is the saturation point, and where do accuracy gains saturate?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2902", "title": "Recall depthwise separable convolution FLOP formula", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How do you compute the FLOPs for a standard 3x3 convolution versus a depthwise separable convolution with Cin=Cout=128 at 28x28, and what is the reduction ratio?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2903", "title": "Recall MobileNetV2 inverted residual structure and why it inverts the bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": 
"remember", "status": "published", "phase": "both", "question": "What is the architectural difference between a ResNet bottleneck and a MobileNetV2 inverted residual block, and why use a linear bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2904", "title": "Recall EfficientNet compound scaling constraints and baseline architecture", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does EfficientNet's compound scaling differ from simply widening or deepening a baseline model like B0, and what do the coefficients represent?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2905", "title": "Implement INT8 quantization for MobileNetV3 on TensorRT with calibration", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calibrate and convert MobileNetV3-Large to TensorRT INT8 on A100 and verify the accuracy drop stays below 1%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2906", "title": "Optimizing Depthwise Convolution with Shared Memory on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you optimize the C=512, 14x14, batch-32 depthwise 3x3 layer on H100 beyond cuDNN's 18% SM utilization?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 3}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2907", "title": "Implement channel pruning for EfficientNet-B3 to reduce serving costs", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you prune EfficientNet-B3 by 40% serving cost while retaining at least 81% top-1, and how would you validate it in TensorRT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2908", "title": "Mastery: explain why inverted residuals outperform standard bottlenecks on memory-bandwidth-limited hardware", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would a roofline model argument using arithmetic intensity explain why inverted residuals outperform standard bottlenecks on bandwidth-limited hardware?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2909", "title": "Mastery: trade-off analysis of SE block overhead vs accuracy gain in EfficientNet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the total SE-module FLOP overhead in EfficientNet-B0, and would removing SE plus width scaling lower H100 cost at the same accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2910", "title": "Mastery: knowledge distillation from EfficientNet-B7 to MobileNetV3 at scale", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you distill EfficientNet-B7 into MobileNetV3-Large, and why is intermediate feature distillation needed beyond soft labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2911", "title": "Optimize depthwise separable convolution throughput via kernel fusion on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you fuse kernels in the MobileNetV2 expand-depthwise-project block to reduce 47 launches and memory transactions on H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2912", "title": "Optimize EfficientNet training throughput with gradient checkpointing and mixed precision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What speedup and throughput do batch=256, BF16, selective checkpointing, and CUDA graphs provide, and does it meet the 1,200 images/sec/GPU target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2913", "title": "Optimize MobileNetV3 width multiplier selection for a fixed A100 latency budget", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What MobileNetV3-Large width multiplier best improves accuracy while keeping batch-128 latency under the 2ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2914", "title": "Realize end-to-end EfficientNet-B4 serving with DALI preprocessing on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you build the GPU preprocessing and inference pipeline for EfficientNet-B4 to serve 2,000 QPS of 1080p JPEGs on one H100 node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2915", "title": "Realize MobileNetV2 export pipeline from PyTorch to ONNX to TensorRT", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the complete export chain from PyTorch to TensorRT FP16, and what are the known failure points at each step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2916", "title": "Realize automated NAS-style width and depth search for EfficientNet-style backbone", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you realize a once-for-all NAS pipeline under the budget, and what is the maximum per-epoch time constraint if 22,500 seconds is allocated to training?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 3}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2917", "title": "Specification: define latency-accuracy SLA for efficient CNN cloud API", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What hardware, model, batching, and failure-mode spec would you choose to serve 50K QPS under the p50, p99, and >80% ImageNet SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2918", "title": "Fluency: explain depthwise separable convolution to a non-ML systems engineer", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do depthwise separable convolutions differ from standard convolutions, why are they faster, and how can they change model behavior?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2919", "title": "RAG Pipeline Latency Budget on H100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the 380 ms RAG TTFT being spent, which stage is the bottleneck, and what changes get P95 under 300 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2920", "title": "Router Model Accuracy vs Latency Tradeoff in Multi-Model Pipeline", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": 
"Is 91% router accuracy sufficient for the GPT-4-class versus 7B routing setup, given expected latency, P95, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2921", "title": "Explaining Compound AI Pipeline Behavior to Non-Technical Stakeholders", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can the 5-stage agentic RAG pipeline give factual errors even when each stage's metrics look healthy in isolation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2922", "title": "Mastering KV Cache Sharing Across Agents in Multi-Model Orchestration", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If the optimized pipeline costs 42% of the original baseline, what is the actual cost reduction, and why might someone incorrectly claim 37% savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2923", "title": "End-to-End Latency SLA for Multi-Hop Agent Chains on A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which hops in the 4-hop agent chain can be parallelized, and what P95 latency is achievable after restructuring toward the 500 ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2924", "title": "Memory Capacity Planning for Concurrent RAG Agents on H100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Do the 16 concurrent 13B agent sessions plus shared 7B reranker fit on one 8xH100 node, and how would you allocate memory with tensor parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2925", "title": "Cold Start Latency for Serverless Compound AI on A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What keep-warm strategy would reduce the 18s cold start for the bursty serverless deployment without paying for always-on GPU time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2926", "title": "Implementing Semantic Caching for RAG on H100 to Reduce LLM Calls", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a semantic cache for the 50,000-query/day RAG system, and how should it invalidate entries after 6-hour KB updates?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2927", "title": "K8s GPU Device Plugin: Why Pod Requests Whole GPUs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does nvidia.com/gpu: 1 allocate an entire GPU, how does the device plugin expose GPUs, and why aren't fractions default?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 0}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2928", "title": "Specifying a Carbon-Aware Cooling Control Policy for H100 Clusters", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you specify a cooling control policy that dynamically switches between cooling modes based on both ambient temperature and grid carbon intensity to minimize operational carbon footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2929", "title": "Designing a Volcano Job for Multi-Pod Distributed Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What Kubernetes scheduling configuration prevents partial-allocation deadlock for 64-GPU PyTorch DDP jobs sharing the H100 cluster with inference workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2930", "title": "Specifying Resource Limits to Prevent GPU Memory OOM Eviction", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What Kubernetes resource and readiness configuration would reduce vLLM OOMKilled restarts on A100 nodes without over-provisioning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2931", "title": "Implementing a Custom Kubernetes Scheduler Extender for GPU Topology", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you implement a Kubernetes scheduler extension that scores H100 nodes by GPU-to-GPU bandwidth topology for the 4-node training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2932", "title": "Evaluating Gang Scheduling vs. 
Elastic Training Tradeoffs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For week-long fine-tunes on the 128-H100 cluster with a 5% weekly node failure rate, should you use Volcano gang scheduling or elastic training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2933", "title": "Evaluating Horizontal Pod Autoscaler for GPU Inference Scaling", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is HPA on GPU utilization the right autoscaling signal for bursty 2–3 minute vLLM traffic spikes, and what should you use instead?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 2}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2934", "title": "Fluency: Translating a PyTorch DDP Job into a Kubernetes Manifest", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you translate the 4-node, 8-GPU-per-node torchrun command into a Training Operator PyTorchJob manifest?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2935", "title": "Realizing Cluster Autoscaler Behavior with GPU Node Groups", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do GPU nodes in the EKS cluster take 8–12 minutes to join after a pod goes pending, and what steps make up that delay?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 0}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2936", "title": "Realizing Why GPU Jobs Hang After Node Preemption", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is happening when the job hangs after preemption, and how would you detect this failure proactively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2937", "title": "Analyzing Pod Scheduling Latency Under Quota Pressure", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Team A’s GPU pods take 15–20 minutes to schedule while Team B’s schedule quickly despite both being within quota?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2938", "title": "Analyzing RBAC and Security Context for Multi-Tenant GPU Namespaces", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the blast radius of the cluster-wide DCGM Exporter, and how would you design RBAC and GPU metric isolation between namespaces?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2939", "title": "Mastery: Designing a Zero-Downtime GPU Cluster Upgrade Strategy", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you roll out the NVIDIA 525-to-550 driver upgrade across 64 H100 nodes without >5 minutes inference downtime or lost training progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2940", "title": "Recall: What PUE Measures and Why 1.0 Is Physically Impossible", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why is a PUE of 1.3 targeted instead of 1.0 for the datacenter, and is a lower PUE always achievable or better?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 0}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2941", "title": "Recall: Rack Power Budget Limits for H100 Dense Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Can a standard 15 kW rack feasibly host two 10.2 kW DGX H100 nodes, and what datacenter constraints apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2942", "title": "Realizing How Liquid Cooling Changes PUE for H100 Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Where would direct liquid cooling save power versus the current PUE 1.55 air-cooled H100 cluster, and what infrastructure changes are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2943", "title": "Realizing Carbon-Aware Scheduling: When to Shift GPU Workloads", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should carbon-aware scheduling route the 7-day, 512-GPU H100 training job between US-West and US-East, and what grid-signal infrastructure is needed?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2944", "title": "Realizing Power Capping vs. Thermal Throttling on H100", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might a 500 W power cap cause a 30% throughput drop instead of the expected 15%, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2945", "title": "Implementing Power-Aware Bin Packing for Heterogeneous GPU Racks", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a power-aware placement algorithm that maximizes H100 and A100 GPU density per rack without exceeding 20 kW at P95 load?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 3}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2946", "title": "Implementing Carbon-Aware Job Queue with WattTime Integration", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you implement a WattTime-driven Kubernetes scheduler that cuts Scope 2 emissions 30% without increasing average job latency over 20%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2947", "title": "Evaluating PUE Measurement Methodologies: Instantaneous vs. Annual", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PUE quote is more meaningful for H100 hosting, Vendor A’s instantaneous 1.2 or Vendor B’s ISO annual 1.45, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2948", "title": "Evaluating Free Cooling Feasibility for H100 Liquid-Cooled Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is year-round free cooling viable for a DLC-equipped H100 cluster in Austin with <= 45°C inlet water, and how many hours need mechanical assist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2949", "title": "Evaluating Total Carbon Cost of H100 vs. 
TPU v5e for LLM Training", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the total carbon footprint of pre-training the 7B LLM on US-East GPUs versus Google TPU v5e, and which option is lower-carbon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2950", "title": "Diagnosing Unexpected PUE Spike in H100 Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically diagnose a weekend PUE spike from 1.35 to 1.72 when H100 GPU utilization stayed at 85% and cooling had no alerts?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 2}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2951", "title": "Optimization: Reducing Stranded Power Through Dynamic GPU Power Capping", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you dynamically power-cap the 128-GPU A100 cluster so all GPUs stay powered while total IT power remains under 45 kW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2952", "title": "Specification: Designing Rack Power Distribution for MI300X Dense Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What rack layout, PDU configuration, circuits, and breaker sizing would you specify for the 64-node MI300X cluster with N+1 redundancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2953", "title": "Dataset Curation: Why Data Quality Degrades Model Calibration on H100", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can near-duplicate web-scraped text cause ECE > 0.15 in the 70B LLM, and what data statistics would reveal the problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2954", "title": "Dataset Curation: Design a Multi-Stage Quality Pipeline for LLM Pre-Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a multi-stage pipeline to filter 50 TB of raw Common Crawl into about 5 TB of high-quality tokens within 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2955", "title": "Dataset 
Curation: Design a Labeling Pipeline for Multi-Modal Training Data", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you label 100M image-text pairs to exceed 95% accuracy in 30 days with 10 human annotators and access to GPT-4V?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2956", "title": "Dataset Curation: Evaluate Deduplication Strategies for Pre-Training Data Quality", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 20 TB pre-training dataset use exact URL dedup or 0.8-Jaccard MinHash near-dedup, given quality and runtime tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2957", "title": "Dataset Curation: Evaluate Domain Mixing Ratios for Instruction Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which mixing strategy should you use for the 500K code, 200K instruction, and 50K math examples, and how would you evaluate forgetting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2958", "title": "Dataset Curation: Napkin Math for Tokenizer Coverage on Multilingual Data", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What tokenizer fertility should you expect after adding 20% non-English data to an English-only 32K tokenizer, and how does it affect sequence length and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2959", "title": "Dataset Curation: Napkin Math for Data Mixing Budget", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many of the 1.3T Chinchilla-optimal tokens should come from web, books, code, and papers after applying proportional mixing with a 2x quality multiplier for books?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2960", "title": "Dataset Curation: Implement a Data Quality Scoring Function", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design and size a KenLM perplexity scorer to process 1B documents within 6 hours on a 16-node CPU cluster?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 1}, "chain_tiers": 
{"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2961", "title": "Dataset Curation: Mastery — End-to-End Pre-Training Data Strategy for 100B Model", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete data curation strategy (sourcing, filtering, deduplication, mixing) is required, and what throughput must the pipeline sustain?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2963", "title": "Dataset Curation: Mastery — Data Flywheel Architecture for Production LLM", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a weekly data flywheel that turns 50M daily user queries into private, high-quality, fresh training data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2964", "title": "Dataset Curation: Optimize Streaming Data Pipeline Throughput for H100 Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is starving the 256-H100 training job, and how would you fix the DataLoader and storage path to recover the expected MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2965", "title": "Dataset Curation Optimize Deduplication Pipeline Scalability", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the bottleneck in the MinHash dedup pipeline, and how would you complete the 5T-token run in under 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2966", "title": "Dataset Curation: Realize Token Budget for Chinchilla-Optimal 30B Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the full data requirements and DataLoader throughput needed for a Chinchilla-optimal 30B model on 64 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2967", "title": "Dataset Curation: Realize Multi-Modal Dataset Storage and Access Architecture", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage, indexing, and DataLoader 
architecture would sustain training on 1B image-text pairs for the 20B vision-language model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2968", "title": "Dataset Curation: Recall — What is Perplexity-Based Data Filtering?", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is perplexity-based filtering, and what threshold range is commonly used to classify low-quality versus high-quality documents?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2969", "title": "Dataset Curation: Specification — Design a Data Quality SLA for Continuous Pre-Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative SLAs should gate the 100M weekly documents for data quality, freshness, deduplication, PII, and pipeline throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2970", "title": "Fault Tolerance: Analyze Checkpoint Frequency vs Recovery Cost on H100 Cluster", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes expected lost time for the 70B training job with 8-minute checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2971", "title": "Fault Tolerance: Design Checkpointing Architecture for 1024-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What checkpointing strategy minimizes interruption for a 175B model on 1,024 GPUs while preserving full recovery capability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2972", "title": "Fault Tolerance: Evaluate Full vs Incremental Checkpointing for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 2-week 30B run, how do full 30-minute checkpoints compare with 10-minute incremental checkpoints, and what strategy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2973", "title": "Fault Tolerance: Evaluate ZeRO vs Full Replica Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For ZeRO-3 on 256 H100s, should checkpointing save each rank's shard or gather the full model on rank 0, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2974", "title": "Fault Tolerance: Implement Checkpoint Size Calculation for Mixed-Precision Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the per-rank and total checkpoint sizes for the 13B ZeRO-2 job, and do they fit within 100GB of NVMe per node?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2975", "title": "Fault Tolerance: Mastery — Design Full Fault Tolerance Architecture for 1000-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What fault-tolerance architecture for checkpointing, detection, recovery, and spares will keep the 500B, 2,048-GPU run above 95% uptime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2976", "title": "Fault Tolerance: Optimize Checkpoint Write Throughput on Lustre Filesystem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Diagnose the Lustre bottleneck and explain how to tune it to reach at least 120 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2977", "title": "Fault Tolerance: Specification — Define Recovery Time Objective for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What RTO, RPO, checkpoint cadence, storage budget, and recovery SLAs would you set for the 90-day 512-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2978", "title": "Feature Store: Design a Low-Latency Feature Store for Real-Time LLM Serving", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a feature store architecture to serve 100K requests/second with 500 features per request at < 5ms p99 latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2979", "title": "Feature Store: Diagnose Feature Staleness Causing Model Accuracy Degradation", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": 
"What root cause explains the CTR drop when a feature has zero variance, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2980", "title": "Feature Store: Evaluate Online vs Offline Feature Store Architectures", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the daily revenue impact of a 2% metric degradation due to training-serving skew if the system processes 100K RPS with an average value of $0.01 per prediction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2981", "title": "Feature Store: Evaluate Point-in-Time Correct Feature Joins for Training Data", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is a latest-value feature join risky for the fraud dataset, and how does a point-in-time correct join avoid data leakage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2982", "title": "Feature Store: Fluency — Feature Vector Size and Retrieval Latency Estimation", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much Redis memory and network bandwidth are needed for 10M users, 5M catalog items, and 50K requests per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2983", "title": "Feature Store: Mastery — Design Feature Store for Multi-Modal Real-Time LLM", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What feature store architecture meets < 3ms p99 for 200K requests/s with text embeddings, session context, and image embeddings?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 4}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2984", "title": "Feature Store: Optimize Feature Computation Pipeline for H100 Training Throughput", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you optimize the 32-GPU nightly feature pipeline from 6 hours to under 2 hours when 70% of time is in 5 aggregations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2985", "title": "Feature Store: Realize Feature Store Sizing for Production ML Platform", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What complete feature store 
infrastructure would you build for 10 model families, 500M users, 1M catalog items, and 500K requests/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2986", "title": "Feature Store: Specification — Define Feature Freshness SLA for Real-Time Fraud Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What feature freshness SLAs, latency budgets, and monitoring would you set for the 50K TPS fraud system's three feature tiers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2987", "title": "Kernel Fusion: Recall — What is Operator Fusion and Why Does It Matter?", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is kernel fusion, which HBM bottleneck does it address, and what bandwidth-saving formula applies?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2988", "title": "Kernel Fusion: Design a Fused Attention Kernel for H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For the 4096x64x128 attention layer, how large are Q, K, V and the full score matrix, and why is FlashAttention tiling mandatory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2989", "title": "Kernel Fusion: Evaluate FlashAttention vs Standard Attention on H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 1K, 4K, and 16K context lengths, how do FlashAttention-2 and standard attention differ in memory use, throughput, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2990", "title": "Kernel Fusion: Evaluate Fused vs Unfused MLP Blocks", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which parts of the 13B MLP block are compute-bound versus memory-bound, and how much does fusion help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2991", "title": "Kernel Fusion: Implement Fused LayerNorm + Linear on H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For batch=1 decode, how much HBM traffic 
does fusing LayerNorm + Linear save for hidden size 4096, and what latency gain should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2992", "title": "Kernel Fusion: Mastery — Design Fusion Strategy for High-Throughput LLM Prefill", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the top fusion opportunities to improve the 38% MFU for 70B LLM prefill on 8 GPUs, and what is the expected MFU improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2993", "title": "Kernel Fusion: Mastery — Fusion Impact Analysis for Autoregressive Decode", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 7B batch=1 decode, what fusion strategy is useful, and are the gains from bandwidth reduction or kernel-launch overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2994", "title": "Kernel Fusion: Optimize CUDA Kernel Fusion for Transformer LayerNorm", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much HBM bandwidth does a fused single-pass LayerNorm save versus a naive three-pass LayerNorm for the 30B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2995", "title": "Kernel Fusion: Optimize Fusion Strategy for Multi-Query Attention", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do fused and unfused MQA kernels differ in decode mode for the 13B LLM, and how much KV-cache bandwidth does MQA save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2996", "title": "Kernel Fusion: Realize Kernel Fusion Gains for 70B LLM Inference", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the correct per-layer attention FLOP count, and why is the 8.59 TFLOPs estimate fundamentally flawed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2997", "title": "Kernel Fusion: Realize Fusion ROI Analysis for Production LLM Serving", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the break-even time for implementing kernel fusion on this LLM serving system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2998", "title": "Kernel Fusion: Specification — Define Fusion Requirements for Custom CUDA Kernel", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What correctness, performance, precision, and validation requirements would you set for the fused RMSNorm + QKV CUDA kernel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2999", "title": "KV-Cache: Design PagedAttention KV Cache for Multi-Tenant LLM Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a PagedAttention KV cache for 100 concurrent 128-4096-token requests on 8 H100s to maximize GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3001", "title": "KV-Cache: Evaluate KV Cache Quantization Tradeoffs on H100", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16, INT8, and INT4 KV caches compare for HBM usage, decode throughput, and quality on this 13B workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3002", "title": "KV-Cache: Evaluate PagedAttention vs Continuous Batching Scheduling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate GPU utilization, memory waste, and throughput for static preallocation versus PagedAttention continuous batching for this mixed workload.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3003", "title": "KV-Cache: Implement KV Cache Size Calculation for LLM Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the per-GPU HBM allocation for model weights, KV cache, and activations, and how many tokens can be cached simultaneously?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3004", "title": "Design KV Cache Infrastructure for 100K RPS LLM Platform", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What complete KV-cache strategy would you use to serve 100K RPS with 256-token inputs and 128-token outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3005", "title": "KV-Cache: Mastery — KV Cache Memory Budget Optimization for Long-Context Serving", 
"topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you budget HBM to support at least 10 concurrent 32K-context requests for the 13B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3006", "title": "KV-Cache: Optimize KV Cache Eviction Policy for LLM Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What KV-cache memory per token should you use when designing the eviction policy for this 7B GQA-8 workload, and how does it inform the eviction strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3007", "title": "KV-Cache: Realize KV Cache Memory Layout for Tensor-Parallel 70B Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the 70B KV cache be laid out per GPU with TP=8 and 8 KV heads, and what capacity does it provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3008", "title": "KV-Cache: Realize Cross-Request KV Cache Sharing for RAG Applications", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache memory per token should the shared RAG document cache budget for the 32-layer GQA-8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3009", "title": "KV-Cache: Specification — Define KV Cache SLAs for Production LLM API", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV-cache size per token and per 4K-context request should the production 7B API budget to meet these requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3010", "title": "KV-Cache: Specification — KV Cache Budget for Multi-Model Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What per-token KV-cache size should the single-H100 multi-model setup budget for its GQA-8 models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3011", "title": "KV-Cache: Specification — KV Cache Sizing for Speculative Decoding", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What 
per-token KV-cache budget is required for the 70B target model in the speculative decoding setup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3012", "title": "Dataset Curation: Evaluate Training Data Quality Metrics for LLM Fine-Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which metric better predicts downstream task performance: instruction-following rate or semantic diversity score?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3013", "title": "Dataset Curation: Realize Data Versioning Infrastructure for Continuous LLM Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the data versioning system to support weekly data arrivals, reproducible runs, rollback, A/B mixes, and audit trails?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3014", "title": "Latency Decomposition: Compare Batched vs. Streaming Inference on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do batched serving and individual continuous batching compare in end-to-end latency for 512-token prefill and 256-token decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3015", "title": "Latency Decomposition: Compare KV-Cache vs. 
No-Cache Decode Latency on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At sequence length 1024, how does per-token decode latency compare with and without a KV cache for the 7B model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3016", "title": "Latency Decomposition: Full-Stack E2E Latency Budget for LLM API on H100 Cluster", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you budget the P99 latency components, identify the dominant term, and optimize to fix an SLA miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3017", "title": "Latency Decomposition: Diagnose and Fix Decode Latency Regression on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did memory utilization jump to 89% after moving from 13B to 20B, and how would you quantify the fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3018", "title": "Latency Decomposition: Size the Per-Component Latency for a 70B Model Serving Request", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What NVLink bandwidth and all-reduce overhead should be included in the 4-GPU tensor-parallel latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3019", "title": "Latency Decomposition: Specify Prefill/Decode Split Requirements for SLA on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What prompt length, decode length, GPU count, and batch-size constraints satisfy TTFT < 200ms and TPOT < 30ms for the 13B chat app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3020", "title": "MLOps Lifecycle: Design a CI/CD Pipeline for LLM Fine-Tuning", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What CI/CD pipeline would support nightly 7B fine-tuning, benchmark evaluation, 5% canaries, rollback under 5 minutes, and what are its storage and runtime budgets?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3021", "title": "MLOps Lifecycle: Compare MLflow vs. 
Weights & Biases for LLM Experiment Tracking at Scale", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do self-hosted MLflow and W&B compare on operational overhead, storage cost, and best-run query latency for 1000 experiments?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3022", "title": "MLOps Lifecycle: Compare Blue-Green vs. Rolling Deployment for LLM API on H100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct extra cost of an on-demand fallback for the 16-GPU deployment for a one-hour interruption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3023", "title": "MLOps Lifecycle: Implement Model Registry Versioning Strategy for Multi-Region LLM", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the model registry schema and cross-region replication strategy to deduplicate 13B artifacts and allow <5-minute rollback?", "chain_ids": ["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 0}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3024", "title": "MLOps Lifecycle: End-to-End MLOps System Design for Production LLM on H100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "In a serving capacity model for 10M requests/day, what does the value 57.9 represent?", "chain_ids": ["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 2}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3025", "title": "MLOps Lifecycle: Concretely Size Model Registry Storage for a Large ML Organization", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much registry storage is needed for 200 experiments per week over a year, and what tiering strategy minimizes cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3026", "title": "MLOps Lifecycle: Specify a Model Evaluation Gate for Production Promotion on H100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantitative promotion thresholds should automatically gate a new 13B checkpoint for benchmark, latency, memory, quality, and errors?", "chain_ids": 
["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 1}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3027", "title": "H100 NVLink Bandwidth for Tensor Parallel Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For H100 tensor parallel inference, what NVLink bandwidth should you use when estimating all-reduce communication overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3028", "title": "Model Size Estimation: Design Sharding Strategy for 405B Model on H100 Cluster", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What H100 NVLink bandwidth should the 405B sharding analysis use, and why is 600GB/s the wrong value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3029", "title": "Model Size Estimation: Design Quantization Strategy for Fitting 70B on Single H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantization and memory layout would fit a 70B LLaMA model on one 80GB GPU for low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3030", "title": "Comparing GPT-3 and LLaMA-2 Memory Footprints on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do GPT-3 175B and LLaMA-2-70B compare in inference memory for weights, KV cache, and activations at batch 16 and seq 2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3031", "title": "Model Size Estimation: Compare MoE vs. 
Dense Memory Footprint", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does Mixtral-8x7B inference memory compare with dense LLaMA-2-13B at batch 32 and sequence length 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3032", "title": "Model Size Estimation: Fluency — Estimate 7B Model Memory From Scratch", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory does a 7B parameter model need at FP16 for inference with a batch of 8, generating up to 512 tokens?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3033", "title": "Model Size Estimation: Master Full Memory Budget for LLM Training on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the full GPU memory budget including weights, gradients, optimizer states, and activations, and does it fit on a single H100?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3034", "title": "Model Size Estimation: Master Attention KV-Cache Scaling Laws on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the KV-cache memory per token, max context capacity, and how does GQA (8 KV heads) change this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3035", "title": "Model Size Estimation: Diagnose OOM Error During Fine-Tuning", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 13B fine-tuning run OOM, and what minimum changes make it fit in 80GB?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3036", "title": "Model Size Estimation: Diagnose KV-Cache Memory Leak on H100 Serving System", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing memory to 
grow from 35GB to 78GB over 4 hours, and how long until the 80GB H100 OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3037", "title": "Model Size Estimation: Realize Full Memory Layout for 13B Model Serving on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the H100 memory budget for 13B LLaMA-2 FP16 serving and compute max concurrency at 1024 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3038", "title": "Model Size Estimation: Realize FP8 vs. FP16 Memory Comparison for 70B on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much H100 memory does a 70B LLaMA model need in FP16 vs FP8, and what KV-cache budget remains on 1, 2, and 4 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3039", "title": "Model Size Estimation: Realize Memory Budget for Multi-Modal LLM on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does LLaVA-13B fit on one 80GB H100 at batch 8 and 4096 context, and what is the maximum batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3040", "title": "Model Size Estimation: Specify Memory-Efficient Serving for 70B on 2-H100 Budget", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can a 70B model prefill a 512-token prompt in 37 ms on an H100, and what is the correct compute-based estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3041", "title": "Network Bandwidth Bottlenecks: Design NVLink vs. PCIe Topology for 8xH100 Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For TP-2 and DP-4 training on an 8xH100 DGX, which traffic uses NVLink versus PCIe, and how much bandwidth does each consume?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 3}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3042", "title": "Network Bandwidth Bottlenecks: Compare PCIe 4.0 vs. 
5.0 Impact on H100 Data Loading", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the actual input tensor size for batch 256 ImageNet 224x224 RGB float32 images, and why is an estimate of 3.3GB too high?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 2}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3043", "title": "Network Bandwidth Bottlenecks: Compare AllReduce over NVLink vs. InfiniBand", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 13B model with 26GB FP16 gradients, how long does the AllReduce take with NVLink versus InfiniBand, and what is the efficiency ratio?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 1}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3044", "title": "Network Bandwidth Bottlenecks: Recall PCIe and NVLink Bandwidth Specs", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the exact bidirectional bandwidth specifications for PCIe 5.0, NVLink 4.0, and NVSwitch, and when does each limit performance?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 0}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3045", "title": "Network Bandwidth Bottlenecks: Implement Ring-AllReduce Time Formula for H100 Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact Ring-AllReduce time for the 13B model on 8 H100s, and is it compute-bound or communication-bound if step time is ~100ms?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 0}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3046", "title": "Network Bandwidth Bottlenecks: Master Full Communication Analysis for 3D-Parallel LLM Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the specific TP, PP, and DP communication costs per step, and which dominates the 3D-parallel setup?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 3}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3047", "title": "Network Bandwidth Bottlenecks: Diagnose Bandwidth Bottleneck in Multi-Node Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 32-GPU 7B data-parallel job, is NVLink or 200Gb/s InfiniBand the bottleneck, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3048", "title": "Network Bandwidth Bottlenecks: Realize AllReduce Data Volume for 70B Model Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 70B training on 64 H100s using TP-8 and DP-8, how much gradient data does each GPU AllReduce and how long does it take over 200Gb/s InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3049", "title": "Network Bandwidth Bottlenecks: Realize PCIe Bandwidth Impact on Model Checkpointing", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does checkpointing a 70B FP16 model from 8 GPUs through PCIe 5.0 to NVMe take, and can it be hidden during training?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 1}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3050", "title": "Network Bandwidth Bottlenecks: Specify InfiniBand Fabric for 256-GPU LLM Training Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What InfiniBand topology, rail count, per-GPU bandwidth, AllReduce target, and bisection bandwidth would you use for a 256-GPU 70B+ training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3051", "title": "Network Bandwidth Bottlenecks: Fluency — AllReduce Bandwidth in 60 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the AllReduce take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3052", "title": "Network Bandwidth Bottlenecks: Master PCIe+NVLink Communication Overlap Strategy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication schedule and optimizations are needed for 13B training on 16 GPUs across two nodes to 
reach over 90% GPU utilization?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 2}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3053", "title": "Model Size Estimation: Realize Full Training Memory Budget for 7B Model on 4×H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the per-GPU memory budget for fine-tuning a 7B model on 4 GPUs with ZeRO-2, and does it fit in 80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3054", "title": "Pipeline Parallelism Bubble Overhead Comparison: GPipe vs PipeDream", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 4-stage pipeline with 16 micro-batches, what are the GPipe and PipeDream-Flush bubble overheads, and which should train a 32B model?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 2}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3055", "title": "Pipeline Parallelism Bubble Overhead Comparison: Stage Count Tradeoff", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which configuration is better for throughput (4-stage or 8-stage), and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3056", "title": "Implement Pipeline Parallelism Bubble Fraction Formula", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For p=8 pipeline stages, how many micro-batches keep bubble overhead below 5%, and what GPipe activation memory does that create per H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3057", "title": "Pipeline Parallelism Mastery: End-to-End Training System Design", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you choose TP, PP, and DP degrees for a 530B model on 512 H100s, and what are the resulting bubble overhead and memory per device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3058", "title": "Pipeline Parallelism Optimization: Diagnosing Bubble Waste and Fixing", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", 
"status": "published", "phase": "training", "question": "Why is the 8-stage GPipe job with m=8 only 71% utilized, and how much would 1F1B with m=64 improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3060", "title": "Pipeline Parallelism Realization: Memory per Stage for LLaMA-70B", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 8-stage pipeline parallel LLaMA-70B training on 80GB H100s with DP=1, how much memory does each stage need, does it fit, and what headroom remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3061", "title": "Pipeline Parallelism Realization: Activation Memory with Gradient Checkpointing", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much activation memory is saved by checkpointing per stage, and does it fit in an 80GB H100 without it?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 1}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3062", "title": "Pipeline Parallelism Recall: GPipe Bubble Overhead Formula", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the GPipe bubble fraction formula, its variables, its m→∞ limit, and the value for p=4, m=16 on H100s?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 0}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3063", "title": "Pipeline Parallelism Specification: Design for <10% Bubble with Memory Constraint", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What TP, PP, DP, and micro-batch settings keep a 175B GPT-style model under 70GB per GPU with pipeline bubble below 10%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3064", "title": "Pipeline Parallelism Specification: Inter-Stage Communication Budget", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "For a 16-stage H100 pipeline training a 1T model, what activation and gradient volume crosses each stage boundary and is 400Gb/s InfiniBand sufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3065", "title": "Pipeline Parallelism 
Fluency: Rapid Bubble Estimation", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What are the estimates for the bubble fraction (p=8, m=56), the micro-batch count for <5% bubble (p=4), and the throughput lost to a 12% bubble on a 500 TFLOP/s cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3066", "title": "Queueing Theory Recall: Little's Law in Inference Systems", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the throughput of a server with 50 concurrent requests and a 2-second service time according to Little's Law, and how does it change if latency doubles?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 0}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3067", "title": "Queueing Theory Analyze: Why Tail Latency Explodes at High Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At 89% inference utilization, how does M/M/1 queueing explain P99 latency far above P50, and what happens at 95% utilization?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 2}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3068", "title": "Queueing Theory Design: Autoscaling Policy for P99 SLO", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For mean service time 2.5s and P99 SLO 8s, what safe utilization, scale trigger, and fleet size are needed for 1000 req/s under M/M/c queueing theory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3069", "title": "Queueing Theory Design: Multi-Server M/M/c for Inference Cluster", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Determine the minimum c to achieve P99 < 200ms, and compare to the throughput and latency of a naive single-server design.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3070", "title": "Queueing Theory Evaluation: FCFS vs Priority Queuing for Mixed Workloads", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Compare FCFS queuing versus preemptive priority queuing (short requests preempt long) for P99 latency, and show which 
achieves better aggregate SLO compliance.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3071", "title": "Queueing Theory Evaluation: Continuous vs Batch Inference Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What H100 FP16 TFLOPS specification should be used for LLaMA-13B serving throughput calculations, and how does continuous batching improve latency over static batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3072", "title": "Queueing Theory Evaluation: M/M/1 vs M/D/1 for Deterministic Service", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the correct M/D/1 mean queue wait formula and value at 80% utilization for 100 ms deterministic service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3073", "title": "Queueing Theory Fluency: M/M/1 Metrics from Memory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For an M/M/1 queue at ρ=0.95 and μ=40 req/s, what is the correct Wq formula and wait time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3074", "title": "Queueing Theory Implement: Erlang-C for Inference Cluster Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Erlang-C with λ=200 req/s and μ=100 req/s per server, how many servers are needed for P99 < 500ms and what is P(wait>0)?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 2}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3075", "title": "Queueing Theory Mastery: Inference System Capacity Planning End-to-End", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many 4-GPU pods are required to support the 5000 req/s workload, and what is the estimated hourly cluster cost at $2/hr per GPU?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 3}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3076", "title": "Queueing Theory Mastery: Speculative Decoding Impact on Queue Dynamics", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": 
"inference", "question": "How does speculative decoding change M/M/1 to M/G/1 queueing for LLaMA-70B at ρ=0.8, and what P99 latency improvement should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3077", "title": "Queueing Theory Optimization: Reduce P99 by 3x Without Adding Hardware", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which no-new-hardware operational changes would bring P99 below 2s, and how much P99 improvement does each provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3078", "title": "Queueing Theory Optimization: Tail Latency via Load Shedding", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using M/M/1/K analysis, what queue-depth threshold K achieves P99 < 1s while maximizing accepted throughput, and what fraction of requests are rejected?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3079", "title": "Queueing Theory Realization: Size an H100 Inference Queue for Black Friday Traffic", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many H100s are needed for the 2000 req/s Black Friday burst, what does the 30-minute burst cost, and should you pre-warm capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3080", "title": "Queueing Theory Realization: Queue Memory Sizing for KV Cache Pooling", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With L=20, how much KV cache memory is needed per request and what maximum active concurrency fits before OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3081", "title": "Queueing Theory Realization: Compute vs Memory Bottleneck in Queued Requests", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is an H100 serving Mistral-7B decode compute-bound or memory-bound, and how does this dictate using an M/D/1 instead of an M/M/1 queueing model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3082", "title": "Queueing Theory Specification: Design a Latency SLO Budget", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you allocate the 500ms P99 latency budget across the load balancer, router, and GPU tiers, and size each M/M/c tier?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3083", "title": "Systolic Array Analyze: Why TPU v5e Outperforms GPU for Matrix Multiply", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a TPU v5e often outperform an H100 for BERT-large inference despite a 5x lower peak FLOP rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3084", "title": "Systolic Array Design: Weight-Stationary vs Output-Stationary Tradeoff", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the 2048-token attention layer Q*K^T on a 256x256 systolic array, should you use weight-stationary or output-stationary dataflow, and why?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 3}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3085", "title": "Systolic Array Design: Tiling Strategy for Large Matrix on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tile sizes, tile counts, memory transfers, and total bandwidth are needed for the 16384x16384 FFN GEMM on a 128x128 array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3086", "title": "Systolic Array Evaluation: TPU v5e vs H100 for Training Transformers", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the H100 FP16 ridge point, and when do decode and prefill become compute- versus memory-bound based on arithmetic intensity?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3087", "title": "Systolic Array Evaluation: Batched vs Streaming Inference on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For ResNet-50 on TPU v5e, what are throughput and latency for batch=128 versus batch=1, and which mode fits interactive vs batch serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3088", "title": "Systolic Array Fluency: Arithmetic Intensity from Memory", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What are the arithmetic intensities for the GEMM, ReLU, and QK^T cases, and which are compute- or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3089", "title": "Systolic Array Implement: GEMM Performance on TPU v5e", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the GEMM time and bottleneck, the MFU at 8.5ms measured time, and the bandwidth impact of 4-way tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3090", "title": "Systolic Array Mastery: Full Transformer Layer Analysis on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do the attention and FFN FLOPs and bandwidth break down for this transformer layer, what is the bottleneck, and what tile/dataflow should XLA use?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 4}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3091", "title": "Systolic Array Mastery: Roofline Analysis for Custom LLM Kernel", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the 12 TFLOPS decode kernel compute- or memory-bound, what is the attainable performance, and which optimization gives the largest gain?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3092", "title": "Systolic Array Optimization: Fix Low MFU on TPU v5e", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is BERT-large at only 7.6% MFU on TPU v5e, and how much speedup should fusing GeLU, LayerNorm, and Linear deliver?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3093", "title": "Systolic Array Realization: Size Systolic Array for Attention Layer", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory and compute time does this attention layer need, and how many 128x128 systolic-array tiles are required?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3094", "title": "Systolic Array Realization: Memory Layout for TPU Batch Inference", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which weight layout should TPU v5e use for the 8192x32768 feed-forward layer, and how much does layout affect bandwidth and performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3095", "title": "Systolic Array Specification: Design for 99% Compute Utilization on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What batch * sequence size is needed for the FFN layer to be compute-bound at ≥99% utilization on TPU v5e, and how do you derive it?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3096", "title": "TCO Analyze: Why Cloud GPUs May Be Cheaper Than On-Prem for Startups", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At what utilization does cloud become cheaper?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 2}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3097", "title": "TCO Design: Spot vs On-Demand vs Reserved Instance Strategy", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What fleet mix of Spot, On-Demand, and Reserved H100s meets 90% availability for 1000 H100-hours/day, and what is the effective cost per H100-hour?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 1}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3098", "title": "TCO Design: Cost Per Inference for Production API Service", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Calculate: cost per 1M tokens, revenue required per 1M tokens to achieve 40% gross margin, and how INT4 quantization (fit on 2 GPUs) changes the economics.", "chain_ids": ["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 2}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3099", "title": "TCO Evaluation: H100 vs TPU v5e for Training Cost", "topic": "tco-cost-modeling", 
"competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the training time, compute cost, and cost per effective FLOP for H100 vs TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3100", "title": "TCO Evaluation: Cost of Quantization for Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cost per 1M tokens for FP16, INT8, and INT4 LLaMA-13B, and which quantization is the best cost-quality tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3101", "title": "TCO Mastery: Build vs Buy Decision for LLM Training Infrastructure", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct annual power cost for a 256-GPU cluster at 700W per GPU, $0.08/kWh, and PUE 1.3?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 4}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3102", "title": "TCO Mastery: Carbon Cost and Sustainable AI Infrastructure", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the operational carbon and offset cost for GPT-3 training on 8 H100s at 45% MFU, and how do coal versus renewable power sources compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3103", "title": "TCO Optimization: Rightsize GPU Fleet for Inference Workload", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much annual GPU spend is wasted by keeping 150 GPUs on 24/7, and what autoscaling plan would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3104", "title": "TCO Optimization: Training Efficiency vs Infrastructure Cost", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the break-even total infrastructure budget required to justify the $60K engineering investment, and what are the ongoing cost savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3105", "title": "TCO Realization: Concretely Size H100 Cluster Budget for Startup", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "What H100 FP16 throughput and 8-GPU aggregate FLOPs should you use when allocating the $500K ML infrastructure budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3106", "title": "TCO Realization: Annual GPU Cost for GPT-4-Scale Service", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPT-4 serving pods and GPUs are needed for 100M queries/day, and what are the annual and monthly GPU costs?", "chain_ids": ["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 1}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3107", "title": "TCO Specification: Design Cost-Optimal Inference for SLO", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the minimum-cost fleet mix of FP16 and INT4 pods to meet the throughput and latency SLOs, and what is its cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3108", "title": "TCO Specification: Design Multi-Region Inference Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you allocate pods across US, EU, and APAC for 500 req/s, what is the annual cost, and why not centralize in US-East?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3109", "title": "Transformer Cost Design Optimal Architecture for Inference Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What model configuration and H100 fleet meet P99<500ms, 100 req/s, and <$0.001/request starting from GPT-2 large?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 3}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3110", "title": "Transformer Cost Evaluation: GPT-2 vs LLaMA-7B Inference Cost", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the decode throughput, request latency, and cost per 1M tokens for GPT-2 large BF16 versus LLaMA-7B INT4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3111", "title": "Transformer Cost Evaluation: Scaling Law Cost Prediction", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using Chinchilla C=6ND, which is more cost-efficient for the same loss target: 70B on 1.4T tokens or 7B on 1T tokens, and what are the costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3112", "title": "Transformer Cost Fluency: FLOPs Estimation from Memory", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 6ND GPT-3 training FLOPs, 70B inference FLOPs per token, and H100-days to train GPT-3 at 40% MFU?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 1}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3113", "title": "Transformer Cost Implement: Chinchilla Optimal Model Size Calculation", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a $100K budget, what Chinchilla-optimal model size and token count should you train, and how does it compare to a 7B run?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 2}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3114", "title": "Transformer Cost Mastery: Full Training and Inference Cost Analysis for LLM Product", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the training cost, inference fleet cost at 1000 req/s, and break-even monthly revenue for the 30B LLM product?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 4}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3115", "title": "Transformer Cost Optimization: Reduce Inference Cost 5x via Speculative Decoding", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What speedup, cost reduction, and memory overhead does speculative decoding with a 7B draft model provide for the 70B INT4 service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3116", "title": "Transformer Cost Realization: Concrete FLOP Count for BERT Training", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are BERT-base training FLOPs by 6ND and by layer-by-layer counting, and how do the two estimates compare?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3117", "title": "Transformer Cost Realization: Size Activation Memory for LLM Training", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the peak activation memory with and without checkpointing, and is model parallelism required on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3118", "title": "Analyze GPipe Bubble Overhead in Pipeline Parallelism", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does reducing from 16 to 8 micro-batches affect the pipeline bubble ratio and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3119", "title": "Design 1F1B Pipeline Schedule for LLM Training on H100 Cluster", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 1F1B schedule would you use, and what are its steady-state throughput and bubble ratio versus GPipe?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 3}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3120", "title": "Design Interleaved Pipeline Schedule for Reduced Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "With v=2 interleaving, what are the new bubble ratio, steady-state efficiency, and added communication cost?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 2}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3121", "title": "Diagnose Load Imbalance Across Pipeline Stages on H100", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 4-stage pipeline imbalance, and how would you rebalance the stages to recover throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3122", "title": "Diagnose Head-of-Line Blocking in LLM Serving Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the 4.2s P99 latency at 60% utilization, and how would you fix the scheduler?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3123", "title": "Recall Systolic Array Weight-Stationary Dataflow", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In a weight-stationary 256x256 systolic array, what data stays stationary, how many times are weights read for a batch of 64 inputs, and how does this compare to naive execution?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3124", "title": "Diagnose Systolic Array Underutilization for Non-Square Matrices", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does TPU v5e utilization collapse to 0.4% at batch size 1, and what minimum batch gives over 50% utilization?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3125", "title": "Recall TCO CapEx vs OpEx Split for GPU Cloud Training", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fully-loaded hourly cost per GPU including CapEx amortization and OpEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3126", "title": "Implement Cost Allocation Per Training Run on Shared H100 Cluster", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate per-job costs, allocate shared overhead, and attribute idle time on the 64-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3127", "title": "Diagnose TCO Anomaly from Checkpoint I/O Costs on Cloud", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is checkpoint frequency or storage retention the primary driver of the $180K storage and egress bill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3128", "title": "TCO Fluency: Compute Cost per Token at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the serving cost per 1M output tokens for GPT-3 175B on 4 GPUs at $2.50/GPU-hour?", "chain_ids": 
["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 0}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3129", "title": "Recall Transformer Inference FLOPs Formula", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the standard FLOPs approximation formulas for inference and training, and how do they scale with sequence length?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 0}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3130", "title": "Analyze Chinchilla Compute-Optimal Training Budget on H100", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What compute-optimal model size and token count does a $5M H100 training budget buy, and how does it compare to GPT-3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3131", "title": "Diagnose Memory-Bandwidth-Bound Decode on H100 Serving Cluster", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is batch-1 decode for the 70B INT8 model compute-bound or memory-bandwidth-bound, and why is throughput about 52 tokens/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3132", "title": "Specify KV Cache Memory Requirements for Long-Context LLM Serving", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What KV cache size, total VRAM need, remaining weight budget, and minimum GPU count are required for 32 requests at 128K context?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 2}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3133", "title": "FlashAttention Tile Size vs SRAM Capacity", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a model with head_dim=128 in FP16, what is the maximum tile size (B_r x B_c) that fits in SRAM, and why does exceeding it force spills to HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3134", "title": "FlashAttention IO Complexity vs Standard Attention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "Compare the HBM IO bytes of standard attention vs FlashAttention, and estimate the wall-clock difference on a 2 TB/s GPU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3135", "title": "PagedAttention Memory Fragmentation", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache VRAM is wasted per GPU by preallocating 4096 tokens when the average length is 512, and how does PagedAttention eliminate it?", "chain_ids": ["cloud-chain-auto-014-04"], "chain_positions": {"cloud-chain-auto-014-04": 1}, "chain_tiers": {"cloud-chain-auto-014-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3136", "title": "Implementing FlashAttention's Online Softmax", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What online-softmax rescaling rule should FlashAttention use when a new K tile raises the row max, and what FLOP overhead does it add?", "chain_ids": ["cloud-chain-auto-014-06"], "chain_positions": {"cloud-chain-auto-014-06": 1}, "chain_tiers": {"cloud-chain-auto-014-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3137", "title": "Diagnosing FlashAttention Regression on Short Sequences", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can FlashAttention-2 be slower than standard attention at seq_len=64 but much faster at seq_len=2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3138", "title": "Specifying KV Cache Budget for PagedAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What PagedAttention block size, block budget, and memory reservation would you set for 512 requests at max_seq_len=8192 on 4 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3139", "title": "FlashAttention Arithmetic Intensity Calculation", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of standard attention versus FlashAttention, and does FlashAttention shift it to being compute-bound?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 1}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3140", "title": "Evaluating Ring 
Attention vs FlashAttention for 128K Context", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 128K context on 8 H100s, how do FlashAttention-2 with Ulysses and Ring Attention compare in communication, memory, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3141", "title": "Designing a Prefix Caching Strategy for RAG", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design prefix caching for the 50 shared 2048-token prompts, and what memory cost, hit rate, and TTFT gain result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3142", "title": "Optimizing FlashAttention Tile Size for MI300X", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does MI300X's 64KB LDS change FlashAttention tile size for head_dim=128 FP16, and what throughput impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3143", "title": "Full-Stack FlashAttention Deployment Decision", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the complete attention stack, KV cache management, sequence parallelism, and prefix caching strategy, and why is 810 TFLOPs per token a unit error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3144", "title": "Sliding Window Attention Memory Savings", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much KV cache does 32K context use with 16 sliding-window and 16 full-attention layers versus full attention on all 32 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3146", "title": "KV Cache Quantization with PagedAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV-cache quantization strategy would let the 4-GPU 70B service reach 500 concurrent requests, and what quality tradeoff does it make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3147", "title": "FlashDecoding for Long-Context Decode", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How much speedup should FlashDecoding provide at 32K context, and why does splitting the KV sequence across blocks help?", "chain_ids": ["cloud-chain-auto-014-07"], "chain_positions": {"cloud-chain-auto-014-07": 1}, "chain_tiers": {"cloud-chain-auto-014-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3148", "title": "Chunked Prefill Scheduling with FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "With a 50ms decode SLO and 30ms prefill budget on 8 H100s, what chunk size should chunked prefill use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3149", "title": "PagedAttention Block Size Tradeoff", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did switching PagedAttention from block_size=16 to block_size=1 improve memory utilization but increase decode latency by 35%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3150", "title": "GQA-Aware FlashAttention Kernel Design", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV bandwidth does a naive GQA FlashAttention implementation waste, and how would you tile to reuse each KV head across 8 query heads?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3151", "title": "FlashAttention Backward Pass Memory Savings for Training", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much activation memory does FlashAttention save by recomputing attention for seq_len=8192, and does it enable a larger batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3152", "title": "Multi-Query Attention vs GQA with FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do MQA and GQA-8 compare for decode throughput, KV cache size, and quality at batch=64 and seq_len=4096?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3153", "title": "FlashAttention FP8 on H100 Transformer Engine", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What throughput speedup should FP8 FlashAttention-3 deliver over FP16 at seq_len=8192, and when does FP8 attention hurt quality?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3154", "title": "PagedAttention Copy-on-Write for Beam Search", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How much KV cache memory does naive beam_width=4 waste at 2048 tokens, and how does PagedAttention copy-on-write avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3155", "title": "FlashAttention Causal Masking Efficiency", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What fraction of 128×128 FlashAttention tiles are skipped by the causal mask at seq_len=4096, and what speedup results for prefill and decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3156", "title": "Ring Attention Communication-Computation Overlap Budget", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is the 400 Gb/s InfiniBand link a bottleneck for Ring Attention across 16 GPUs at 256K context, and what ring ordering should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3157", "title": "Prefix Caching Eviction Policy Design", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which eviction policy (LRU, LFU, ARC) maximizes the KV cache hit rate for a power-law prompt distribution, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3158", "title": "FlashAttention-3 Asynchronous Softmax Pipelining", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For seq_len=8192, head_dim=128, and tile_size=128 on H100, what pipeline efficiency and speedup does FlashAttention-3 achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3159", "title": "FlashAttention Numerical Stability in Mixed Precision Training", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why must FlashAttention keep row max and log-sum-exp in FP32 on H100, and what fails if they are kept in FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3160", "title": "Speculative Decoding Acceptance Rate Fundamentals", "topic": "speculative-decoding", "competency_area": 
"latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "If the acceptance rate is alpha=0.8 per token, what is the expected number of accepted tokens per speculation round, and what is the effective speedup over autoregressive decoding?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 2}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3161", "title": "Speculative Decoding Throughput Degradation Under Load", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can K=5 speculative decoding with a 7B draft reduce throughput by 15% for 200 concurrent requests?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 1}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3162", "title": "Designing a Draft Model Selection Strategy", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using E = (1-α^(K+1))/(1-α), what are the expected tokens generated per step for option A at K=7 and option C at K=6?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3163", "title": "Implementing Rejection Sampling for Lossless Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does rejection sampling preserve the target distribution, and what expected acceptance rate and correction cost apply at KL(p||q)=0.5 nats?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3164", "title": "Diagnosing Speculation Failure on Code Generation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At alpha=0.45 and K=5, how many tokens per speculation step are expected, and can this make code generation slower than baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3165", "title": "Specifying Speculation Length K vs Draft Model Size", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 80 ms target decode, 3 ms/token draft cost, and alpha=0.78, does K=5 or K=15 give higher throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3166", "title": "Speculative Decoding Memory 
Bandwidth Analysis", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What effective memory-bandwidth amplification does K=5 speculative decoding with alpha=0.80 provide over batch-1 autoregressive decode?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 0}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3167", "title": "Tree-Structured Speculation vs Linear Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which produces more accepted tokens per verification round, and what is the verification cost difference?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 1}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3168", "title": "Designing Medusa Heads for Self-Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the Medusa heads for a 70B target with hidden_dim=8192 on 4x H100, and what is their overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3169", "title": "Speculative Decoding KV Cache Management", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If draft tokens 1-3 are accepted and token 4 is rejected, which target KV cache entries are kept, discarded, and how is the correction KV stored?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3170", "title": "Diagnosing Speculation Latency Regression at High Temperature", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does alpha fall from 0.82 at temperature 0.1 to 0.55 at temperature 1.0, and how would you fix the slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3171", "title": "Draft Model Placement on Multi-GPU Inference", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For a 405B target on 8 GPUs, should the 7B draft run with TP=8, on one of the 8 GPUs, or on a separate 9th GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3172", "title": "Speculative Decoding with Continuous Batching", "topic": 
"speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should continuous batching verify requests with variable speculation lengths in a single target forward pass?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 2}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3173", "title": "Adaptive Speculation Depth Based on Token Entropy", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Design an entropy-based adaptive K system that monitors draft model confidence and adjusts speculation depth token-by-token.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3174", "title": "Full-Stack Speculative Decoding Architecture", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What throughput can 128 H100s provide if there are 16 8-GPU replicas, batch 16 per replica, 3.5 tokens per step, and 84 ms steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3175", "title": "Speculative Decoding Token Probability Calibration", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why must speculative decoding use the full vocabulary probability distributions rather than just comparing top-1 tokens, and what property does the rejection sampling procedure guarantee?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3176", "title": "Speculative Decoding vs Increasing Batch Size", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For options (A) speculative decoding (K=5), (B) batch=8, and (C) both, how do latency, throughput, and GPU utilization compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3177", "title": "EAGLE-2 Self-Speculative Architecture", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an EAGLE-style draft head for a 70B target with hidden_dim=8192, and what memory and training costs follow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3178", "title": "Speculative Decoding Verification Compute Cost", "topic": "speculative-decoding", "competency_area": "latency", "track": 
"cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does verifying K=5 draft tokens take about the same wall-clock time as one batch-1 autoregressive step, and what is the cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3179", "title": "Speculative Decoding for Fill-in-the-Middle Tasks", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does fill-in-the-middle generation affect 7B draft acceptance for a 70B target, and how should speculation be modified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3180", "title": "Speculative Decoding Impact on Time-to-First-Token", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did speculative decoding increase TTFT from 200 ms to 350 ms, and how would you fix the first-token delay?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 0}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3181", "title": "Parallel Draft and Target Execution Scheduling", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you overlap draft generation of round N+1 with target verification of round N, and what rollback issue arises?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3182", "title": "Speculative Decoding Break-Even Analysis", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For K=5 with 2 ms draft tokens and 42 ms verification, what alpha_min makes speculation break even, and what is the general formula?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 3}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3183", "title": "Speculative Decoding for Batched Multi-Turn Chat", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does the 7B draft KV cache for 8K-token histories limit capacity for 100 concurrent chat sessions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3184", "title": "Staged Speculation for Ultra-Long Generation", "topic": "speculative-decoding", "competency_area": "latency", "track": 
"cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should speculation be staged across a 10K-token 405B generation, and why does tree speculation help most in the body?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 2}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3185", "title": "Speculative Decoding Cost-Benefit for Serving Economics", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you meet a 15 ms/token P50 target by adding speculation or doubling from 64 to 128 H100s, and what are the cost and tail-latency tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3186", "title": "MoE All-to-All Communication Volume", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a local micro-batch of 2048 tokens per GPU, what is the all-to-all communication volume per layer in each direction?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 1}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3187", "title": "MoE Capacity Factor and Token Dropping", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you raise the MoE capacity factor from 1.25 to 2.0 to fix 8% token drops, or address routing collapse another way?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3188", "title": "MoE Routing Collapse During Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the stalled training and low router entropy despite a converged load-balancing loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3189", "title": "MoE Memory Footprint vs Dense Equivalent", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPUs does each require, and what is the memory efficiency gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3190", "title": "MoE Expert Replication for Hot Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "How would you reduce P99 latency when 2 of 8 MoE experts receive 40% of tokens, and what hardware cost it adds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3191", "title": "MoE Gating Network Overhead", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the parameter count of the router, and what fraction of total layer parameters does it represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3192", "title": "MoE Expert-Choice vs Token-Choice Routing", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the training and serving tradeoffs between token-choice and expert-choice routing for a 64-expert MoE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3193", "title": "MoE Serving with Offloaded Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you offload experts for a 671B, 256-expert MoE on one 8x H100 node, and what latency does PCIe fetching add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3194", "title": "MoE Auxiliary Loss Coefficient Tuning", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you tune MoE routing when α=0.01 leaves dead experts but α=0.1 hurts perplexity?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 2}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3195", "title": "MoE Shared Expert Architecture", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does adding one shared expert alongside 8 routed experts reduce MoE all-to-all communication, and what bottleneck can it introduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3196", "title": "MoE Training Throughput vs Dense Model", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 8x7B MoE 34% slower than a dense 13B on 64 H100s despite similar active FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3197", "title": "MoE 
Granularity: Few Large vs Many Small Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the systems tradeoffs between 8x125B coarse experts and 256x4B top-8 fine-grained experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3198", "title": "MoE Batch Size Impact on Expert Utilization", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If batch size 4 produces 8 independent expert choices over 8 experts, how many distinct experts are expected to be activated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3199", "title": "MoE Quantization Strategy for Serving", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using consistent parameter counts for Mixtral 8x22B, how much memory does a BF16-shared, INT4-expert quantization plan need, and how many 80GB GPUs are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3200", "title": "MoE Fine-Grained Expert Parallelism Communication", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication constraints and placement rules should the all-to-all expert dispatch use across 32 H100 nodes with NVLink and NDR?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 2}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3201", "title": "MoE Prefill vs Decode Phase Routing Differences", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What parallelism layouts should you use for MoE prefill and decode, and why should they differ?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3202", "title": "MoE Parameter Count vs Active FLOPs Scaling", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-token FLOPs and total parameters for the dense 70B and 8x MoE, and why can MoE match quality at lower cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3203", "title": "MoE Checkpointing Overhead at Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": 
"L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What checkpoint size, write time, and nonblocking strategy are needed to checkpoint the 671B MoE every 15 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3204", "title": "MoE Inference Token Routing Latency Breakdown", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency breakdown for one MoE decode layer at batch 32, including router, dispatch, expert compute, and combine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3205", "title": "MoE vs Dense Scaling Law Crossover", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the estimated GPT-4-scale FLOP budget, and what is the exact loss reduction from scaling compute by 8×?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 3}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3206", "title": "MoE Auxiliary Loss Interaction with Gradient Accumulation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does changing gradient accumulation from 1 to 8 destabilize MoE routing, and how would you fix the load-balancing signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3207", "title": "MoE Expert Specialization Analysis", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this specialization desirable or a sign of poor routing, and how does this affect serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3208", "title": "MoE All-to-All Overlap with Expert Computation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can K=4 sub-batching overlap all-to-all with expert compute when communication is 40% of the step, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3209", "title": "MoE Dropless (dMoE) Token Processing Guarantee", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does dropless MoE eliminate token dropping, and what are the memory 
implications on H100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3210", "title": "MoE Multi-Node Placement Strategy", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you place 256 top-8 experts across 4 H100 nodes when some expert pairs are co-activated 70% of the time?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 3}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3211", "title": "MoE Dynamic Expert Loading from SSD", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a tiered caching system for 1024 two-GB experts across GPU HBM, CPU DRAM, and NVMe SSDs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3212", "title": "MoE Serving Cost-per-Token Economics", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What active and full FP16 weight footprints should you use for Mixtral 8x7B in the serving cost comparison, and why do both models require the same number of GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3213", "title": "MoE Sparse Upcycling from Dense Model", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the systems costs and expected quality tradeoffs of sparse upcycling a dense 7B model into an 8-expert MoE versus training from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3214", "title": "MoE Model Parallelism Topology for Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What EP, TP, PP, and DP dimensions would you use to train this 671B, 256-expert MoE on 2048 H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3215", "title": "KV-Cache Size Calculation for GQA Models", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the BF16 KV-cache size per token and for batch 32 at 4096 context for Llama 3 70B with 8 KV heads?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 1}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3216", "title": "PagedAttention Block Size Selection", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which PagedAttention block size—8, 16, or 32 tokens—should you choose for this variable-length workload, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3217", "title": "KV-Cache Quantization to INT8", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can INT8 KV-cache quantization double concurrency from 64 to 128 at 4K context, and what quality impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3218", "title": "Prefix Caching for Shared System Prompts", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much KV-cache memory and TTFT can prefix caching save when 80% of requests share the 2048-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3219", "title": "KV Cache Eviction Under Memory Pressure", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What eviction and admission policy should you use to handle 80 requests requiring up to 204 GB of KV-cache with only a 35 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3220", "title": "KV-Cache Memory as Throughput Bottleneck", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the Llama 3 8B H100 instance limited to 40% compute utilization despite 200 queued requests and nearly full memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3221", "title": "Continuous Batching Interaction with KV-Cache", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does moving from static to continuous batching change KV-cache allocation, eviction, and scheduling requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3222", "title": "KV-Cache Disaggregation for Prefill-Decode Split", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", 
"status": "published", "phase": "inference", "question": "How large is the Llama 3 70B KV-cache for an 8K prefill, and how should the prefill pool hand it off to the decode pool?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 3}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3223", "title": "GQA vs MQA vs MHA KV-Cache Tradeoff", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which attention variants are memory-feasible for batch=16 at 128K context on one GPU, and what changes if none fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3224", "title": "KV-Cache Impact on Decode Memory Bandwidth", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At position 4096, how much bandwidth is spent loading KV-cache versus 70B model weights during decode, and which dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3225", "title": "KV-Cache Memory Fragmentation in Long-Running Services", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does vLLM reject new requests despite 30 GB free GPU memory and only 22 GB of 35 GB allocated KV-cache actively used?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3226", "title": "KV-Cache Compression via Sliding Window + Sink Tokens", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should you manage KV-cache for 20K-token conversations on Mistral 7B while respecting its 4096-token sliding window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3227", "title": "KV-Cache Aware Autoscaling Policy", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What autoscaling metric should replace GPU utilization when requests are rejected at 45% GPU utilization during a traffic spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3228", "title": "KV-Cache Memory Planning for Multi-Turn Chat", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does KV-cache grow over 6 turns for Llama 3 8B, and how many concurrent conversations fit on one 
A100-80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3229", "title": "KV-Cache Recomputation vs Storage Tradeoff", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For Llama 3 70B on 2×H100, which is more economical: storing the KV-cache or recomputing it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3230", "title": "KV-Cache and Speculative Decoding Interaction", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should speculative decoding manage draft and verifier KV-caches when 5 of 8 candidate tokens are accepted?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3231", "title": "KV-Cache Scaling with Context Length Doubling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does doubling context length affect KV-cache memory, decode latency, and maximum batch size for the 96-layer GQA model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3232", "title": "KV-Cache Sharing for Parallel Sampling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can best-of-16 sampling share KV-cache for the common 2048-token prompt, and how much memory does it save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3233", "title": "Disaggregated KV-Cache Storage Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a 100 GB/s CXL-backed distributed KV-cache for Llama 3 70B, and when does it outperform local memory KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3234", "title": "KV-Cache Aware Request Scheduling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the load balancer be modified to prevent replica 1 from rejecting requests while replicas 2-4 remain underutilized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3235", "title": "KV-Cache Growth in RAG Pipelines", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "inference", "question": "How does the RAG context growth to 16,128 tokens scale KV-cache memory, and what does that imply for serving cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3236", "title": "KV-Cache Pool Sizing for Throughput Optimization", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How much memory per GPU should be allocated to the KV-cache pool in the 4×A100 TP=4 deployment, and what throughput does it enable?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 2}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3237", "title": "KV-Cache Deduplication Across Requests", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What fraction of total traffic can reuse KV-cache entries under content-aware deduplication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3238", "title": "KV-Cache Memory vs Compute Roofline During Decode", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache data must be streamed per layer for one Llama 3 8B decode token at sequence position 2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3239", "title": "KV-Cache and Multi-Modal Input Processing", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you manage KV-cache memory for 8 mixed text-only and image+text requests with 1600 visual tokens and 512 text tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3240", "title": "Diagnosing Workload Bottlenecks on NVIDIA H100 with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use a roofline model on the H100 to diagnose the 50 TFLOPS FP16 workload, and what are examples of memory-bound versus compute-bound operations?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 3}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3241", "title": "Optimizing Large Language Model Inference on AMD MI300X with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": 
"cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, is the 70B FP16 LLM on MI300X compute-bound or memory-bound, and how would you improve utilization?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 5}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3242", "title": "Optimizing Large Transformer Inference on NVIDIA H100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you structure H100 CUDA kernels to maximize occupancy, coalesced memory access, and FP16 Tensor Core utilization for transformer inference?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3243", "title": "Diagnosing Underperforming LLM Inference on AMD MI300X", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What MI300X architectural factors could explain 1 TFLOPS at batch size 1 despite high utilization and only 1 TB/s HBM bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3244", "title": "Optimizing Large Language Model Inference on NVIDIA H100 for High-Throughput", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile and optimize the 60 GB FP16 LLM on one H100 for occupancy, Tensor Cores, coalesced memory access, and latency-throughput trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3245", "title": "NVIDIA H100 vs. 
CPU: Optimizing Large Language Model Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is an NVIDIA H100 generally better than a multi-core CPU for LLM inference, and when might a CPU still be preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3246", "title": "LLM Inference Scaling on Google TPU v5e", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a single TPU v5e unsuitable for a 70B BF16 LLM, and how would you shard the model across accelerators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3247", "title": "AMD MI300X Accelerator Selection for Large Language Model Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an MI300X-based LLM inference architecture and justify it against GPUs, TPUs, and custom ASICs?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3248", "title": "Optimizing Large Language Model Inference on a Single NVIDIA A100", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fit and serve a 175B FP16 LLM on a single 80 GB A100 while maximizing low-latency inference throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3249", "title": "Real-time LLM Inference: Accelerator Selection for Performance and Cost", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What serving architecture and accelerator choice would meet 100 ms latency and 10,000 QPS for the BF16 transformer on Google Cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3250", "title": "Accelerator Selection for Large Language Model Inference on AMD MI300X", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How suitable is the MI300X for batch-1 FP16 inference of a 70B LLM compared with CPUs or previous-generation GPUs?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3251", "title": "NVIDIA A100 vs. High-End CPU for LLM Training", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare a GPU with dual CPUs for FP16 LLM training in performance, cost-efficiency, and programmability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3252", "title": "Optimizing Large Language Model Inference on AMD MI300X", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you serve a 100B FP16 LLM on MI300X versus a custom ASIC or CPUs, and optimize data flow despite the 192 GB HBM limit?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3253", "title": "Optimizing Large Language Model Inference on Google TPU v5e with Systolic Arrays", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tile the 20B BF16 transformer's GEMMs on TPU v5e, and when would you choose weight-stationary versus output-stationary dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3254", "title": "Optimizing Dense GEMM on AMD MI300X with Weight-Stationary Dataflow", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would a weight-stationary dataflow work for the 65536x512 by 512x65536 FP16 GEMM on MI300X, and is it compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3255", "title": "Optimizing Large Language Model Inference on AMD MI300X with Systolic Arrays", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you map a Llama 3 70B FFN matrix multiply onto a systolic array, including dataflow choice, tiling, and structured sparsity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3256", "title": "AMD MI300X HBM/Compute Bottleneck and Cost Analysis for LLM Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 100 TFLOP, 100 GB-per-inference MI300X service compute-bound or memory-bound, and what are the max inferences/sec and cost/inference?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3257", "title": "A100 LLM Cost Analysis: Training & Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate A100 GPU-hours and cost for 10^24 FLOP training plus 100 GPU-hours of inference, and what bottlenecks affect accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3258", "title": "Compute Cost Estimation for a Large Language Model on NVIDIA H100s", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate GPU-hours and cost to train a 70B LLM on 1T tokens at 60% utilization and $3.50 per GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3259", "title": "LLM Training Cost Estimation on Google TPU v5e", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate TPU v5e device-hours and dollar cost to train a 70B model on 1T tokens at 40% efficiency, and what else matters beyond FLOPs?", "chain_ids": ["cloud-chain-auto-secondary-015-08"], "chain_positions": {"cloud-chain-auto-secondary-015-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3260", "title": "H100 Training Cost for Large Language Model", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What GPU-hours, instance cost, electricity cost, and total cost do you estimate for the 100B-parameter training run?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3261", "title": "Optimizing Cost-Performance on Google TPU v5e for LLM Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate TPU v5e device count, device-hours, and cost to serve a 70B BF16 model at 200 ms P99 and 1000 QPS?", "chain_ids": ["cloud-chain-auto-secondary-015-08"], "chain_positions": {"cloud-chain-auto-secondary-015-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3262", "title": "LLM Fine-tuning Cost Estimation on AMD MI300X", "topic": "compute-cost-estimation", 
"competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate MI300X GPU-hours and cost to fine-tune a 70B FP16 LLM on 1T tokens, and how could memory bandwidth change the estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3263", "title": "LLM Training Cost Estimation on NVIDIA H100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate total FLOPs, GPU-hours, and dollar cost to train a 70B model on 2T tokens, and how would you refine it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3264", "title": "VRAM Budgeting for 70B LLM on AMD MI300X", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you account for VRAM used by FP16 weights, AdamW state, activations, and KV-cache for 70B training and batch-8 inference?", "chain_ids": ["cloud-chain-auto-008-19"], "chain_positions": {"cloud-chain-auto-008-19": 0}, "chain_tiers": {"cloud-chain-auto-008-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3265", "title": "KV-Cache Optimization on AMD MI300X for Varying Context Lengths", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache management change would you make to avoid OOMs and erratic latency with frequent 64k-token requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3266", "title": "Optimizing LLM KV-Cache on NVIDIA A100 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache architecture would you use on A100s to handle 32k-64k contexts without OOMs while maximizing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3267", "title": "NVIDIA H100 KV-Cache Capacity Planning", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "With 80 GB of HBM used only for BF16 KV-cache, how many total tokens can be stored for 96 heads of dimension 128?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3268", "title": "Diagnosing KV-Cache Eviction on Google TPU v5e", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How would you diagnose this issue on the TPU v5e, pinpointing the root cause, and what initial steps would you take to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3269", "title": "Optimizing KV-Cache on AMD MI300X for High-Throughput LLM Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design KV-cache paging and eviction on MI300X to maximize concurrent 128k-token contexts with 64 KB of KV per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3270", "title": "A100 KV-Cache Management for Long Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where is the memory pressure for 32k-context LLM inference on an 80 GB A100, and how would you prevent OOMs while maximizing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3271", "title": "Optimizing KV-Cache on Google TPU v5e for Long Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you size, page, and evict the KV-cache for 128k-context 7B inference on a 16 GB TPU v5e to reduce pressure and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3272", "title": "Optimizing KV-Cache Management on NVIDIA A100 for Long-Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design and size a paged KV-cache system on 80 GB A100s for long-context serving with dynamic eviction and low tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3273", "title": "NVIDIA A100 Memory Hierarchy Bottlenecks for Large Models", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a 150B FP16 LLM problematic on a single 80 GB A100 despite a 50 GB active working set, and what memory-hierarchy trade-offs result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3274", "title": "Analyzing Compute-Memory Tradeoffs with Gradient Checkpointing on NVIDIA H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does gradient checkpointing reduce observed TFLOPS, and how would you optimize the compute-memory 
trade-off?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 1}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3275", "title": "TPU v5e Activation Checkpointing Strategy for Large Language Models", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you incorporate activation checkpointing for a 100B-parameter MoE on TPU v5e when activations exceed 16 GB HBM per chip?", "chain_ids": ["cloud-chain-auto-027-25"], "chain_positions": {"cloud-chain-auto-027-25": 0}, "chain_tiers": {"cloud-chain-auto-027-25": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3276", "title": "Optimizing LLM Training with Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much memory does checkpointing save for the 250 GB activation block, and what extra latency does recomputation add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3277", "title": "Optimizing Large Model Training on NVIDIA A100 with Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you apply gradient checkpointing to train the 50B Transformer on one 80 GB A100, and what compute-memory trade-off would you expect?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 2}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3278", "title": "H100 Memory Optimization for Large Language Models: Checkpointing vs. 
Parallelism", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose between aggressive checkpointing on one accelerator and pipeline parallelism across accelerators for a 1T-parameter MoE model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3279", "title": "TPU v5e Activation Memory & Gradient Checkpointing for LLMs", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you apply gradient checkpointing on a TPU to fit the LLM within 16 GB HBM while preserving training throughput?", "chain_ids": ["cloud-chain-auto-027-25"], "chain_positions": {"cloud-chain-auto-027-25": 1}, "chain_tiers": {"cloud-chain-auto-027-25": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3280", "title": "Optimizing Large Model Training on AMD MI300X with Activation Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose activation-memory OOMs and choose a selective checkpointing strategy that quantifies memory saved versus recompute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3281", "title": "Optimizing Large Model Training with Gradient Checkpointing on A100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would gradient checkpointing let this billions-parameter Transformer train on an 80 GB A100, and how would you tune the compute-memory tradeoff?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 3}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3282", "title": "TPU v5e Data Transfer Bottleneck Analysis", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are host-to-TPU activation transfers dominating throughput on TPU v5e, and how would you optimize data movement to reduce that bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3283", "title": "Optimizing Large Embedding Data Movement on AMD MI300X", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you 
design host-device data movement to minimize latency for embeddings and KV cache transfers given PCIe Gen5 bottlenecks?", "chain_ids": ["cloud-chain-auto-secondary-017-18"], "chain_positions": {"cloud-chain-auto-secondary-017-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3284", "title": "Optimizing Data Movement for LLM Inference on Google TPU v5e", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the TPU v5e data movement strategy for real-time LLM inference when weights, activations, and datasets exceed 16 GB HBM?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3285", "title": "Optimizing Data Movement for LLM Inference: A100 vs. V100 Evaluation", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate these two architectures considering the impact of host-device data movement overhead, DMA transfers, and the potential benefits of zero-copy techniques and pinned memory on the overall inference latency and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3286", "title": "Optimizing Data Movement for Large Models on NVIDIA H100", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you minimize host-device transfer overhead on an H100 when fine-tuning a 100B model with large optimizer states or embeddings offloaded to CPU memory?", "chain_ids": ["cloud-chain-auto-secondary-017-18"], "chain_positions": {"cloud-chain-auto-secondary-017-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3287", "title": "TPU v5e Memory Management: Diagnosing OOMs in Large Model Training", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do OOMs still occur on 16 GB HBM despite optimizer offload, and how would you reduce the peak memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3288", "title": "Designing a Memory-Efficient LLM Inference System on AMD MI300X", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage memory for a 130B LLM on a 192 GB GPU to 
avoid OOMs and page thrashing while keeping inference latency low?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3289", "title": "Optimizing LLM Deployment on A100: Mitigating Memory Fragmentation and OOM", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate A100 memory fragmentation and OS-level eviction under peak load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3290", "title": "LLM Memory Pressure Management on NVIDIA H100 Cluster", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a distributed training strategy to prevent OOM errors, minimize fragmentation, and optimize gradient accumulation for a 175B LLM on H100s?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3291", "title": "LLM Training on AMD MI300X: Memory Pressure Management", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate OOMs on MI300X training runs, including fragmentation, gradient accumulation, and OS-level eviction effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3292", "title": "Optimizing 70B LLM Training on NVIDIA A100 Under Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and resolve these memory pressure issues to ensure stable and efficient training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3293", "title": "LLM Fine-tuning OOM on NVIDIA H100: Diagnosing and Mitigating Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate OOMs for a 100B distributed LLM, including fragmentation, gradient accumulation, offload, and OS eviction effects?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 3}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3294", "title": "LLM Inference Latency Decomposition on NVIDIA A100", "topic": 
"latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce the 550 ms end-to-end latency across network, CPU, GPU, and postprocessing, and what are the specific bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3296", "title": "Designing Low-Latency, High-Throughput LLM Inference with Dynamic Batching on NVIDIA A100", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the system, including scheduling policies, to leverage the hardware's capabilities effectively while meeting both latency and throughput KPIs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3297", "title": "Optimizing Real-time LLM Inference on NVIDIA H100s with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design batching and scheduling to minimize latency while sustaining 1000 QPS P99 throughput under variable arrivals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3298", "title": "Optimizing Batching Strategies for LLM Inference on Google TPU v5e", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you choose among static, dynamic, and continuous batching on TPU v5e to maximize throughput while meeting a 100 ms P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3299", "title": "Optimizing Real-time LLM Inference Latency and Throughput on NVIDIA A100 with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design adaptive batching for LLM inference to achieve sub-50 ms p99 latency while maximizing throughput under variable traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3300", "title": "Understanding Tail Latency in AMD MI300X Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What causes P999 stragglers on the MI300X despite 5.3 TB/s HBM3 bandwidth, and what first mitigations would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3301", "title": "NVIDIA H100 LLM Inference: Achieving P99 Latency SLAs", "topic": "tail-latency", "competency_area": "latency", 
"track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you leverage techniques like straggler detection, hedged requests, and SLA-driven resource management to ensure this critical P99 target is always met?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3302", "title": "Optimizing Tail Latency for Real-time Inference on Google TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reduce P999 latency below 100 ms on TPU v5e when most inferences take 10 ms but a small fraction become stragglers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3303", "title": "Designing a Low-Latency Inference Service on NVIDIA A100 with Strict SLAs", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the inference service to consistently meet the <100ms P99 and <250ms P999 tail latency SLA given variable traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3304", "title": "H100 Inference Service P99 Latency with Stragglers", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design your system to effectively monitor, mitigate, and predictably meet the P99 latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3305", "title": "Optimizing Tail Latency for Real-time ML Inference on Google TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What multi-TPU v5e architecture reduces stragglers to meet a 100ms P99 SLA, and how does it compare mathematically to a single TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3306", "title": "H100 LLM Training Latency Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the initial steps to diagnose this latency issue, and what specific H100 hardware characteristics should be considered during the analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3307", "title": "Optimizing Low-Latency LLM Inference on AMD MI300X: Profiling and Bottleneck Identification", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "How would you profile an MI300X LLM inference service from system metrics down to GPU kernels to find the root causes of inconsistent latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3308", "title": "Optimizing LLM Inference Latency on NVIDIA A100 with Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use profiling and tracing tools to systematically determine whether high inference latency is compute-, memory-, or I/O-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3309", "title": "NVIDIA H100 Latency Diagnosis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you pinpoint the bottleneck causing 150 ms H100 inference latency when batch size is tuned, CPU is idle, and nvidia-smi shows 90% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3310", "title": "TPU v5e Latency Bottleneck Analysis for Large Language Models", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile TPU v5e inference to determine whether peak-time tail latency is compute-bound, memory-bound, or I/O-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3311", "title": "Optimizing Large Language Model Inference Latency on AMD MI300X", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile MI300X LLM inference to reduce 500 ms average latency toward 100 ms across compute, memory, and I/O bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3312", "title": "NVIDIA H100 Inference Queue Depth and Little's Law", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Little's Law to choose queue depth and capacity for a single accelerator while keeping average inference latency at 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3313", "title": "Optimizing Inference Latency on AMD MI300X with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Little’s Law and M/G/1 queueing to set arrival-rate limits and queue depth for 25 ms 
MI300X inference under a 50 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3314", "title": "Diagnosing H100 Inference Latency Spikes with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use queueing theory to diagnose P99 spikes above 1000 ms when average utilization is 60-70% but scheduler queue depth is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3315", "title": "Optimizing Real-Time ML Inference Latency on Google TPU v5e with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many TPU v5e devices and what queueing strategy would you use to serve 10,000 QPS with 50 ms average latency and 150 ms P99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3316", "title": "LLM Inference Queue Management on AMD MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you set per-GPU queue depth and capacity plans for 200 ms service time while keeping average end-to-end latency at 500 ms under spikes?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 0}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3317", "title": "Optimizing LLM Inference Latency with Queueing Theory on NVIDIA A100s", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you apply queueing theory to analyze performance, determine optimal queue depth, and plan capacity to meet the 200ms P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3318", "title": "Optimizing LLM Inference Latency on AMD MI300X with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you apply queueing theory principles to determine optimal queue depths, manage arrival rates, and plan capacity to minimize latency and prevent resource starvation or underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3319", "title": "Memory Optimization for LLMs on TPU v5e with Mixed-Precision Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": 
"both", "question": "For a 10B parameter model, how many parameters must be quantized from BF16 to INT8 to fit the model within 16 GB of HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3320", "title": "Optimizing LLM Deployment with INT8 Quantization on AMD MI300X", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What technical steps and tradeoffs would you consider to convert the 100 GB FP16 LLM to INT8 on an MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3321", "title": "Quantization Strategy for LLM Deployment on NVIDIA A100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and justify an INT8/INT4 quantization strategy on an 80GB GPU, including PTQ vs QAT, granularity, zero-points, and expected gains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3322", "title": "H100 Mixed-Precision Performance Considerations", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you implement mixed-precision training on the GPU and verify it benefits from Tensor Cores and HBM bandwidth?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3323", "title": "Analyzing Mixed-Precision Performance on Google TPU v5e", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would migrating this FP32 LLM to BF16 mixed precision on TPU v5e affect memory, throughput, stability, and design choices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3324", "title": "Optimizing Large Language Model Training with Mixed Precision on AMD MI300X", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision training strategy would you use on MI300X for this FP32 LLM, and how would you manage stability and accuracy tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-015-04"], "chain_positions": {"cloud-chain-auto-secondary-015-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3325", "title": "Optimizing Large Model Training with Mixed 
Precision on NVIDIA A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For the 175B-parameter LLM, what are the FP32/FP16 parameter memory needs, theoretical FP16 speedup, and critical stability technique?", "chain_ids": ["cloud-chain-auto-secondary-015-05"], "chain_positions": {"cloud-chain-auto-secondary-015-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3326", "title": "H100 Mixed-Precision Training Instability and Performance Diagnosis", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of these stability and performance issues, considering the specific hardware capabilities?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3327", "title": "Optimizing LLM Training Throughput and Memory with Mixed Precision on TPU v5e", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision training strategy would you use on TPU v5e to fit the model in 16 GB HBM while preserving FP32-like accuracy?", "chain_ids": ["cloud-chain-auto-secondary-015-03"], "chain_positions": {"cloud-chain-auto-secondary-015-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3328", "title": "Optimizing Large Language Model Training with Mixed Precision on NVIDIA A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare FP16 AMP with BF16 mixed precision, and which would you initially pursue for this LLM?", "chain_ids": ["cloud-chain-auto-secondary-015-05"], "chain_positions": {"cloud-chain-auto-secondary-015-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3329", "title": "Optimizing Large Model Training with Mixed-Precision on Google TPU v5e", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the OOM and low throughput for a 70B FP32 LLM on a TPU v5e and design a mixed-precision strategy to fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-03"], "chain_positions": {"cloud-chain-auto-secondary-015-03": 0}, "chain_tiers": 
{"cloud-chain-auto-secondary-015-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3330", "title": "Optimizing LLM Training with Mixed-Precision on AMD MI300X", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design mixed-precision training for a 175B LLM on MI300X, including dtype choices, loss scaling, stability, and accelerator sizing?", "chain_ids": ["cloud-chain-auto-secondary-015-04"], "chain_positions": {"cloud-chain-auto-secondary-015-04": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3331", "title": "NVIDIA A100 LLM Inference Throughput Under Power Constraints", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose this issue and explain the behavior despite high utilization, considering power management concepts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3332", "title": "Power-Efficient LLM Inference on Google TPU v5e", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use TPU v5e power caps, DVFS P-states, and the CMOS power equation to minimize energy per token while meeting a 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3333", "title": "Energy-Efficient LLM Inference on AMD MI300X Cluster", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize tokens per Joule while meeting throughput and latency targets using TDP, power caps, DVFS, and P≈CV²f?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3334", "title": "Google TPU v5e Thermal Limits and Sustained Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What thermal mechanisms limit TPU v5e burst vs sustained BF16 performance, and how would higher ambient temperature affect them?", "chain_ids": ["cloud-chain-auto-secondary-015-44"], "chain_positions": {"cloud-chain-auto-secondary-015-44": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3335", "title": "AMD MI300X Thermal Throttling Analysis for LLM Inference", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "Why does MI300X inference drop to about 60% of peak after sustained load, and what telemetry would indicate thermal throttling?", "chain_ids": ["cloud-chain-auto-secondary-015-45"], "chain_positions": {"cloud-chain-auto-secondary-015-45": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3336", "title": "Designing for Sustained Performance on NVIDIA A100: Thermal Management for Large-Scale AI", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design thermal management for an A100 inference cluster at up to 30°C ambient to sustain high FP16 throughput without throttling?", "chain_ids": ["cloud-chain-auto-secondary-015-46"], "chain_positions": {"cloud-chain-auto-secondary-015-46": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3337", "title": "Diagnosing Performance Variability on Google TPU v5e due to Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose whether TPU training slowdowns after an hour are caused by thermal throttling and ambient temperature effects?", "chain_ids": ["cloud-chain-auto-secondary-015-44"], "chain_positions": {"cloud-chain-auto-secondary-015-44": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3338", "title": "Cloud LLM Thermal Design with AMD MI300X", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design cooling and throttling mitigation for a two-MI300X server node to maximize sustained training performance and meet SLAs?", "chain_ids": ["cloud-chain-auto-secondary-015-45"], "chain_positions": {"cloud-chain-auto-secondary-015-45": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3339", "title": "NVIDIA A100 Thermal Throttling in Cloud Inference at Scale", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal-aware orchestration, power, and cooling strategy would you use to prevent A100 throttling under 35°C rack hotspots?", "chain_ids": ["cloud-chain-auto-secondary-015-46"], "chain_positions": {"cloud-chain-auto-secondary-015-46": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3340", "title": 
"Optimizing LLM Inference Energy on AMD MI300X", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might MI300X LLM inference have low tokens/s and high energy cost despite massive TFLOPS, and how do energy-aware operators help?", "chain_ids": ["cloud-chain-auto-secondary-009-18"], "chain_positions": {"cloud-chain-auto-secondary-009-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3341", "title": "Energy-Aware MoE Inference on AMD MI300X", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an MI300X MoE inference pipeline to maximize throughput within a power envelope using energy-aware operator selection?", "chain_ids": ["cloud-chain-auto-secondary-009-18"], "chain_positions": {"cloud-chain-auto-secondary-009-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3342", "title": "Optimizing Energy Efficiency for LLM Inference on NVIDIA A100", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce energy per query for an A100 MoE LLM whose expert weights are frequently loaded from HBM2e under a latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3343", "title": "NVIDIA A100 Cluster Power and Cooling Optimization", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize power, cooling, and PUE for 100 GPUs, and what carbon-aware scheduling strategies apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3344", "title": "Datacenter Architecture for H100-based AI Cluster with Carbon Efficiency Goals", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design power, cooling, and carbon-aware scheduling for an H100 training cluster targeting PUE 1.1 over a 5-year lifecycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3345", "title": "Diagnosing Unexpected Power Overages in an AMD MI300X Cluster", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What could be the root cause of these symptoms, and how would you methodically diagnose it?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3346", "title": "Datacenter Efficiency: A100 Rack Design for Carbon-Aware ML in Cloud", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fit 16 A100s into a 10 kW rack budget with cooling, layout, carbon-aware scheduling, and 5-year carbon tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3347", "title": "NVIDIA H100 Rack Power and PUE Analysis", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calculate whether 16 GPUs fit in a 15 kW rack and estimate grid power at PUE 1.6?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 1}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3349", "title": "Optimizing H100-powered AI Clusters for Carbon-Aware Datacenter Efficiency", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design rack power, liquid cooling, carbon-aware scheduling, and embodied-vs-operational carbon strategy for an H100 datacenter at PUE 1.15?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3350", "title": "Optimizing Large Transformer Inference on Google TPU v5e: Attention and KV-Cache Scaling", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do KV-cache growth and O(N²) attention bottleneck a 70B Transformer on TPU v5e, and what attention or quantization changes would mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3351", "title": "Optimizing KV-Cache Memory on AMD MI300X for Large Language Models", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What formula and maximum sequence length apply at batch size 8 before KV-cache OOM when 140 GB of the 192 GB HBM holds weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3352", "title": "NVIDIA A100 Performance Bottleneck in Large Context LLM Inference with Multi-Head Attention", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does standard MHA cause memory and latency bottlenecks for 64k contexts on an 
A100, and what attention optimizations would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3353", "title": "Scaling LLM Inference with Attention Variants", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What attention mechanism and context-scaling strategy would you use for 128k-token LLM inference, and how would you justify it quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3354", "title": "Scaling Attention for Long Contexts", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which attention modification would you choose for 128K-token inference, and how does it address memory and latency bottlenecks?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3355", "title": "Optimizing LLM Context with Attention Scaling on NVIDIA H100", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate MHA versus MQA or GQA for 128k-token LLM inference on H100, including memory, bandwidth, latency, and quality tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 3}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3356", "title": "Optimizing LLM Attention Scaling on AMD MI300X for Long Context", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose KV-cache bandwidth bottlenecks in MHA and quantify the benefit of switching to GQA, MQA, or sliding-window attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3357", "title": "Scaling Long-Context Attention on NVIDIA A100 for LLM Inference", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the system to handle the memory and compute requirements, considering techniques like grouped-query attention or sliding window attention?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 4}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3358", "title": "MoE Inference Optimization on NVIDIA A100", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you configure expert parallelism and MoE routing to achieve high throughput and low latency under severe memory bandwidth constraints?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 0}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3359", "title": "Optimizing MoE Routing on NVIDIA H100 for Latency-Sensitive Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are GPU MoE throughput and latency scaling poorly, and how would you mitigate routing, capacity, and memory-bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3360", "title": "Scaling a 1 Trillion Parameter Mixture of Experts Model on NVIDIA A100s", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and size a cost-minimal MoE inference system for 10,000 QPS at 50 ms token latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3361", "title": "MoE Routing and H100 Capacity Planning for Large-Scale Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce the MoE service's tail latency from expert load imbalance and HBM3 memory contention on H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3362", "title": "Optimizing MoE Routing on NVIDIA A100 for Extreme Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks explain the 50 vs. 
200 tokens/sec MoE throughput on A100s, and how would you quantify and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3363", "title": "Estimating a Large Language Model's Fit on Google TPU v5e", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Without considering quantization or optimizations, what is the initial memory challenge when deploying this model on a single TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3364", "title": "LLM Deployment Feasibility on AMD MI300X", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a 70B FP16 LLM fit and run in real time on one MI300X, and what memory or bandwidth bottlenecks would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3365", "title": "NVIDIA H100 Model Deployment Feasibility for Large Language Model", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a 175B FP16 LLM fit on one H100, what is the memory-bound token latency, and how many H100s are minimally required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3366", "title": "Estimating LLM Deployment on Google TPU v5e", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a single TPU v5e hold this 70B model plus activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3367", "title": "TPU v5e Optimization: Encoder-Decoder Tradeoffs for Mixed Workloads", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture would you recommend for optimizing performance and cost on the TPU v5e, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3368", "title": "Encoder-Decoder Tradeoffs on AMD MI300X for Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the inference compute, memory, and bandwidth costs of encoder-only, decoder-only, and encoder-decoder models on the MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3369", "title": "A100 Inference Optimization: Encoder-Decoder 
Tradeoffs for Real-time LLM Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the A100 tail latency and cost issues, and what architecture would you use for summarization plus response generation?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3370", "title": "H100-Optimized LLM: Encoder vs. Decoder Architecture Tradeoffs for Cost-Effective Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate encoder-only, decoder-only, and encoder-decoder options on H100s against 50 ms latency, throughput, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3371", "title": "A100 Deployment Strategy: Encoder-Decoder Tradeoffs for LLM Inference", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the system costs and architectural tradeoffs, specifically leveraging the A100's capabilities, to make a recommendation for a low-latency, high-throughput service?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3372", "title": "H100 Deployment Strategy: Encoder-Decoder Tradeoffs for LLM Cost Optimization", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture (encoder-only, decoder-only, or encoder-decoder) would you choose for query understanding and generation, and how would you quantify bottlenecks?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3373", "title": "Encoder-Decoder Architecture Tradeoffs on Google TPU v5e for Real-time Inference", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture is best for low-latency conversational AI on TPU v5e, and how do latency, throughput, HBM, and cost trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3374", "title": "H100 Performance Bottlenecks with Structured vs. 
Unstructured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does high pruning sparsity fail to speed up inference on H100, and how do structured and unstructured pruning differ?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 1}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3375", "title": "Optimizing LLM Deployment with Structured Sparsity on Google TPU v5e", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you change from unstructured to structured pruning on TPU v5e to reach 2x speedup, and what utilization gains would you target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3376", "title": "Optimizing Large Language Model Deployment on AMD MI300X with Structured Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach structured versus unstructured pruning, and what sparsity patterns would best exploit sparse compute hardware?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 3}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3377", "title": "Optimizing LLM Deployment on NVIDIA A100: Structured vs. 
Unstructured Pruning for Latency and Throughput", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which pruning approach and sparsity patterns would you use to keep accuracy within 1% while improving latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3378", "title": "Optimizing LLM Deployment via Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill a 70B teacher into a 7B student while balancing logit or feature distillation, HBM use, and throughput?", "chain_ids": ["cloud-chain-auto-secondary-015-32"], "chain_positions": {"cloud-chain-auto-secondary-015-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3379", "title": "Optimizing Real-time Inference with Knowledge Distillation on NVIDIA A100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill the image segmentation teacher into an A100-friendly student for 10,000 RPS at 50 ms p99, and why choose distillation over pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3380", "title": "H100 Memory Optimization for Knowledge Distillation Logit Matching", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What maximum batch size fits for logit matching with 20 GB free HBM3, sequence length 2048, vocab 50,000, and what logit precision would you use?", "chain_ids": ["cloud-chain-auto-secondary-015-30"], "chain_positions": {"cloud-chain-auto-secondary-015-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3381", "title": "Optimizing Knowledge Distillation for a ResNet-like Model on Google TPU v5e: A Latency and Accuracy Challenge", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What steps would you take to diagnose the root cause of the unexpected latency and lower throughput, considering the specific TPU v5e specs and the knowledge distillation approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3382", "title": "Optimizing Large Language Models with Knowledge Distillation on AMD MI300X", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", 
"status": "published", "phase": "training", "question": "How would you choose between logit matching and feature distillation, and when is distillation preferable to pruning on MI300X for sub-50ms latency?", "chain_ids": ["cloud-chain-auto-secondary-015-31"], "chain_positions": {"cloud-chain-auto-secondary-015-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3383", "title": "Optimizing Large Language Model Deployment on H100 via Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you size the student model and choose logit or feature distillation to balance near-teacher accuracy, latency, and cost?", "chain_ids": ["cloud-chain-auto-secondary-015-30"], "chain_positions": {"cloud-chain-auto-secondary-015-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3384", "title": "Optimizing Real-time LLM Inference on Google TPU v5e via Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the TPU v5e bottleneck for the 15B BF16 teacher, and how would you distill a student to hit 50 ms while retaining 98% accuracy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3385", "title": "LLM Distillation for High-Throughput Inference on AMD MI300X", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you serve the 7B distilled LLM on MI300X using quantization, sparse compute, and hardware-tailored distillation, and when would it beat pruning?", "chain_ids": ["cloud-chain-auto-secondary-015-31"], "chain_positions": {"cloud-chain-auto-secondary-015-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3386", "title": "H100 Memory Bottleneck in LLM Element-wise Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are the H100 element-wise operations memory-bound, and how would you fuse them to reduce global memory traffic and kernel launch overhead?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3387", "title": "Optimizing Transformer Inference on Google TPU v5e via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use operator fusion on TPU v5e to cut memory-bound element-wise latency, and how would you identify, implement, and validate it?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3388", "title": "Optimizing Memory-Bound Operations via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the three memory-bound element-wise operations to reduce memory traffic and kernel launch overhead?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 4}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3389", "title": "TPU Kernel Fusion for Memory-Bound Element-wise Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the ReLU and bias addition kernels, and what performance improvement would you expect?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3390", "title": "Optimizing LLM Inference on AMD MI300X via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify and fuse memory-bound element-wise operators into custom ROCm kernels on MI300X to maximize HBM3 bandwidth and cut latency?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3391", "title": "H100 LLM Inference Optimization: Kernel Fusion for Memory-Bound Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify, implement, and validate operator fusion for these memory-bound LLM ops to reduce kernel launches and global memory traffic?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 5}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3392", 
"title": "H100 Graph Optimization Strategies for LLMs", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do AOT compilation, operator lowering, and constant folding improve LLM inference on an H100?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 1}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3393", "title": "TPU v5e Graph Compilation Performance Analysis", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might a custom activation underperform on this accelerator despite low HBM use, and how should operator lowering be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3394", "title": "MI300X Operator Fusion for LLM Memory Bandwidth Optimization", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If fusion cuts memory traffic by 30% for a strictly memory-bound operator, what theoretical execution-time reduction should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3395", "title": "Optimizing LLM Inference on NVIDIA A100 with Custom AOT Graph Compiler", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT graph compiler for an LLM to remove dynamic dispatch overhead and improve HBM utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3396", "title": "Optimizing Large Model Inference on H100 with AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT graph compiler for H100 LLM inference using operator lowering and constant folding while balancing FP16 compute and HBM3 bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3397", "title": "Optimizing 7B LLM Inference on Google TPU v5e via AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use AOT compilation, operator lowering, and constant folding to hit 50 ms per token for a 7B LLM on one TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3398", "title": "Optimizing LLM 
Inference on AMD MI300X: Memory-Bound Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reschedule operators to improve memory reuse, parallel execution, and layer fusion for the 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3399", "title": "NVIDIA A100 Operator Scheduling for LLM Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule operators and handle dynamic batching on GPUs to minimize latency and maximize throughput for LLM inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3400", "title": "Optimizing Transformer Operator Scheduling on NVIDIA A100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an operator schedule that reduces Transformer inference latency and peak HBM memory use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3401", "title": "H100 Transformer Inference Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare Strategies A and B on the GPU for latency, throughput, and memory use, and which hardware limits matter most?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3402", "title": "Optimizing LLM Inference Scheduling on Google TPU v5e", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule LLM operators to minimize execution time under the 16 GB HBM and 1.6 TB/s bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3403", "title": "Optimizing LLM Inference on AMD MI300X via Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator execution order for the 70B LLM to reduce memory pressure and improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3404", "title": "A100 Transformer Scheduling for Memory and Throughput", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How would you schedule operators to manage dependencies, reuse memory, fuse kernels, and reduce inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3405", "title": "Optimizing IO-Aware Attention on AMD MI300X for Large Sequence Models", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you adapt FlashAttention-style tiling and online softmax to a GPU for long-sequence LLM inference?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 1}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3406", "title": "H100 Memory Optimization for IO-Aware Attention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compute the H100 standard-attention sequence limit, derive FlashAttention's HBM footprint, and explain the bandwidth gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3407", "title": "Diagnosing IO-Bound FlashAttention on TPU v5e", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why 64x64 FlashAttention tiles are IO-bound on TPU v5e, and which metrics would guide you?", "chain_ids": ["cloud-chain-auto-014-08"], "chain_positions": {"cloud-chain-auto-014-08": 0}, "chain_tiers": {"cloud-chain-auto-014-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3408", "title": "Optimizing Large Language Model Attention on AMD MI300X with IO-Aware Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design FlashAttention-style tiled attention with online softmax on MI300X, and quantify the memory and throughput benefits?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 2}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3409", "title": "FlashAttention on A100: Optimizing for Sequence Length and Throughput", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign self-attention with FlashAttention-style tiling and online softmax to support long sequences on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3410", "title": "Designing IO-Aware Attention on NVIDIA A100: FlashAttention and Online Softmax for Large Language Models", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use FlashAttention-style tiling and online softmax on the A100 for 16,384-token, 8,192-hidden LLM attention?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3411", "title": "Optimizing LLM Inference Latency with Speculative Decoding on NVIDIA H100", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does draft-verify speculative decoding use H100 bandwidth and FP16 compute, and how would you choose the draft model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3412", "title": "TPU v5e Speculative Decoding Performance Analysis", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate TPU v5e draft model choices for speculative decoding to minimize streaming generation latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3413", "title": "High-Throughput Speculative Decoding on AMD MI300X for Low-Latency LLM Inference", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect speculative decoding on MI300X to co-locate draft and verify models, select drafts dynamically, and maximize acceptance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3414", "title": "Optimizing Speculative Decoding on NVIDIA A100 for LLM Inference", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and improve A100 speculative decoding when acceptance sometimes falls below 40% and draft overhead is high?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 1}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3415", "title": "Optimizing Speculative Decoding Latency on NVIDIA A100 for LLMs", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What bottleneck likely keeps 70B speculative decoding at 150 ms per token on A100, and what quantifiable fix would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3416", "title": "Analyzing Data Parallelism Bottlenecks on Google TPU v5e", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FSDP data-parallel ResNet-200 training on TPU v5e degrade beyond 64 devices, and how would you improve scaling?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 2}, "chain_tiers": {"cloud-chain-auto-013-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3417", "title": "H100 Tensor Parallelism and Communication Overlap", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What communication challenges does column-partitioned tensor parallelism create on H100s, and how would you overlap communication with compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3418", "title": "Optimizing Large Language Model Inference with Tensor Parallelism on Google TPU v5e", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design TPU v5e tensor parallelism for a 175B LLM to overlap communication and computation and reduce latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3419", "title": "Scaling a 500B Parameter LLM on AMD MI300X with Hybrid Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine tensor and pipeline parallelism across GPUs for a 500B LLM to balance memory, communication, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3420", "title": "Diagnosing Tensor Parallelism Bottlenecks on NVIDIA GPUs for LLMs", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose tensor-parallel underperformance characterized by low compute utilization and heavy inter-GPU communication across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3421", "title": "Designing Efficient Tensor Parallelism for Large Language Models on Google TPU v5e", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you shard a 100B LLM across TPU v5e devices with tensor parallelism to minimize inference latency and communication?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3423", "title": "Optimizing Pipeline Parallelism on Google TPU v5e", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you split the model across TPU pipeline stages and use micro-batching and interleaving to reduce bubble overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3424", "title": "Optimizing LLM Pipeline Parallelism on AMD MI300X with Micro-batching and Interleaving", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do pipeline bubbles arise with 8 stages at batch size 1, and how would micro-batching and interleaving improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3425", "title": "Optimizing LLM Inference on H100 with Pipeline Parallelism", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What micro-batch size maximizes throughput for the 4-stage pipeline with a total batch size of 64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3426", "title": "Diagnosing TPU Pipeline Stalls in Large Language Models", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of these unexpected stalls and massive pipeline bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3427", "title": "Choosing Micro-Batch Count for an 8-Stage A100 Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What micro-batch count m brings the bubble fraction below 10% while keeping the 1F1B activation memory footprint inside the 80 GB HBM budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3428", "title": "Optimizing LLM Pipeline Parallelism on NVIDIA H100 Cluster", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you schedule an 8-stage H100 pipeline for a terabyte-scale MoE model to minimize bubbles and exploit compute and bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3429", "title": "Pipeline Bubble on a Heterogeneous TPU v5e Stage Layout", "topic": "pipeline-parallelism", "competency_area": 
"parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For an 8-stage TPU v5e pipeline with a 175 ms stage-8 vs 100 ms stage 1-7 and m=4 micro-batches, what is the bubble fraction and what schedule change brings utilization to within 10% of ideal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3431", "title": "3D Parallelism Sizing — Adam State + Activations on A100s", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Decompose how DP, TP, and PP affect the per-GPU memory footprint, and choose the minimum (TP, PP) to fit.", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 0}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3432", "title": "Optimizing Frontier Model Training with 3D Parallelism on NVIDIA H100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why might 3D-parallel training of a 1T-parameter model on H100s stall during pipeline communication despite high HBM bandwidth?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 2}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3433", "title": "Optimizing 3D Parallelism for Frontier LLM Training on AMD MI300X", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine data, tensor, and pipeline parallelism on the cluster for a 1T-parameter LLM with a 2 PB training footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3434", "title": "Optimizing Frontier Model Training with 3D Parallelism on NVIDIA A100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design 3D parallel training for a 500B FP16 model on 8-GPU A100 nodes while managing memory and communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3435", "title": "Designing 3D Parallelism for Frontier Models on NVIDIA H100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you integrate data, tensor, and pipeline parallelism on H100s to train trillion-parameter models efficiently?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 3}, "chain_tiers": 
{"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3437", "title": "Designing Efficient Gradient Synchronization for Large-Scale LLM Training on AMD MI300X", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a gradient synchronization strategy for a 1T-parameter LLM data parallelism to minimize network bottlenecks while preserving convergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3439", "title": "Optimizing Gradient Synchronization for LLM Training on H100 Clusters", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which gradient synchronization strategy would you use for 175B-parameter training on 8 H100s in one NVLink server versus two 4-GPU servers over 200 Gb/s Ethernet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3442", "title": "NVIDIA H100 Multi-tenancy with MIG for Resource Sharing", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does MIG on a GPU enable multi-tenant GPU sharing, and what resource guarantees and isolation does each instance provide?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 0}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3443", "title": "Google TPU v5e: Multi-Tenant Scheduling and Resource Contention", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are the RL and LLM jobs interfering on shared TPU v5e devices, and how should the scheduler prevent preemption and throughput instability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3445", "title": "A100 MIG Instance Utilization and Concurrency for ML Inference", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For a 1g.10gb A100 MIG slice, how many 20 TFLOPS and 100 GB/s inference requests fit concurrently, and what is the total across seven slices?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3446", "title": "Multi-Tenant H100 GPU Scheduling for Cloud ML", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "How would you schedule and partition GPUs for mixed distributed training and latency-critical inference while ensuring utilization, fairness, and SLOs?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 3}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3447", "title": "TPU v5e Multi-Tenant Scheduling and Preemption", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule the 2 GB, 20 TFLOPS inference job onto the occupied TPU v5e while minimizing disruption, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3449", "title": "Inference Tail-Latency vs Training Throughput Contention on TPU v5e", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you eliminate the 9 ms ICI AllReduce tail that blows the inference SLO on a shared TPU v5e pod, and what does it cost the training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3451", "title": "NVIDIA Collective Communication: Ring vs. Tree AllReduce Latency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 10 GB AllReduce over 8 GPUs with 600 GB/s bandwidth, what are the theoretical minimum communication times for bandwidth-optimal ring and tree algorithms, and why are they similar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3452", "title": "Distributed LLM Training on AMD MI300X: Interconnect Bottlenecks", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might a fat-tree interconnect outperform a torus or Dragonfly topology for MI300X LLM all-reduce, and what topology bottlenecks limit scalability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3453", "title": "Designing High-Performance Interconnects for Large-Scale A100 Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What interconnect architecture would you choose for 128 GPUs with communication-heavy training, and how would NVLink, NVSwitch, InfiniBand, and topology factor in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3454", "title": "Optimizing MoE LLM Training on H100s: Diagnosing 
and Fixing Interconnect Bottlenecks", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix the MoE expert-dispatch bottleneck on 64 GPUs, and quantify the expected throughput improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3455", "title": "Optimizing LLM Training on AMD MI300X Clusters: Mitigating Network Bandwidth Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate network bandwidth bottlenecks causing underutilized MI300X accelerators in multi-node LLM training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3456", "title": "Optimizing Distributed ML Training with Network Bandwidth Constraints on NVIDIA A100s", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the data flow and communication patterns to mitigate network bandwidth bottlenecks, analyze the communication-computation ratio, and model the bandwidth cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3457", "title": "Optimizing Distributed Training on NVIDIA H100: A Network Bandwidth Challenge", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the communication-computation ratio for 1T-parameter FP16 training on H100s and mitigate the 400 Gbps inter-node bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3458", "title": "Optimizing Network-Bound LLM Training on AMD MI300X Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and address all-reduce bottlenecks for a 70B FP16 LLM with 100 GB/s effective inter-node bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3459", "title": "Optimizing Distributed Training under Network Bandwidth Bottlenecks on A100 Clusters", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically analyze, quantify, and mitigate the network bandwidth bottleneck for this MoE model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3460", "title": 
"Optimizing Distributed LLM Training on NVIDIA H100 Network Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you model bisection bandwidth, communication-computation ratio, and cost impact for 64 H100s stalled on AllReduce, and what optimizations would you take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3461", "title": "Optimizing Inter-GPU Communication for Distributed LLM Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What network and data-movement architecture would you use so multi-node LLM training avoids inter-GPU communication bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3462", "title": "Optimizing Distributed LLM Training with RDMA on NVIDIA A100", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a RoCE/RDMA network and communication strategy for a 100 GB all-reduce so GPUs stay compute-bound?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 2}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3463", "title": "Comparing High-Performance Interconnects for Distributed Training on Google TPU v5e", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do TCP/IP Ethernet and InfiniBand RDMA compare for TPU v5e all-reduce in latency, CPU overhead, memory copies, and training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3464", "title": "Optimizing Distributed Training with RDMA on AMD MI300X", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use RDMA, zero-copy networking, and kernel bypass to make gradient synchronization high-throughput and low-latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3465", "title": "Optimizing Distributed Training Throughput with RoCE on GPUs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reconfigure the RoCE v2 communication stack so the 64 GPUs use RDMA and GPUDirect for the 100 GB all-reduce instead of TCP/IP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3466", "title": "Optimizing Distributed Training Communication with RDMA on NVIDIA H100", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you leverage RDMA, InfiniBand verbs, zero-copy networking, and kernel bypass to maximize communication throughput for an all-reduce operation?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 3}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3467", "title": "TPU v5e Inference Load Balancing Basics", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What basic load balancing algorithms would you use to distribute variable inference requests across TPU v5e instances?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3468", "title": "Optimizing LLM Inference Routing on AMD MI300X Cluster", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you replace round-robin with dynamic, load-aware routing to reduce tail latency and balance utilization across the MI300X servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3469", "title": "Designing a High-Throughput Inference Load Balancer for NVIDIA A100 Clusters", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing for this service considering consistent hashing, weighted round-robin, and dynamic traffic management to meet a strict P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3470", "title": "LLM Inference Load Balancing on TPU Fleet", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route and load-balance 50,000 QPS across TPU inference servers to hit 100 ms latency during updates and failures?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3471", "title": "LLM Inference Routing on AMD MI300X Cluster", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "inference", "question": "How would you load-balance and route 10,000 RPS across 500 instances while handling variable requests, updates, and failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3472", "title": "NVIDIA A100 Inference: Load Balancing for Distributed ML Systems", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate WRR versus locality-aware consistent hashing for 10 A100s across two data centers to handle traffic spikes and GPU failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3473", "title": "LLM Inference Scaling: H100 Load Balancing & Routing for P99 Latency", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the request routing and load balancing strategy to meet these requirements, considering the specific characteristics of the hardware and inference traffic patterns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3474", "title": "Optimizing Inference Routing for High-Throughput TPU v5e Farms", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you replace round-robin with adaptive routing for TPU v5e inference to improve utilization and latency under fluctuating load?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3475", "title": "TPU v5e Network Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Google TPU v5e manage network congestion, and how does its approach differ from ECN, PFC, and DCQCN in RoCE GPU clusters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3476", "title": "Congestion Control in AMD MI300X GPU Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do incast congestion and weak flow control stall all-reduce jobs, and how do ECN, PFC, and DCQCN mitigate this in RoCEv2?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 1}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3477", "title": 
"Mitigating Incast Congestion in GPU Clusters for Distributed ML Training", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you mitigate severe incast congestion during all-reduce using ECN, PFC, DCQCN, and intelligent flow scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3478", "title": "H100 Cluster Congestion Control with DCQCN", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you apply DCQCN to reduce incast during 128-H100 all-reduce, and how would you calculate the resulting per-H100 effective HBM3 bandwidth?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 0}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3479", "title": "Evaluating Congestion Control Strategies in an A100 GPU Cluster", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which proposal would you choose for the 512-GPU incast problem, and what are the trade-offs of DCQCN versus PFC/ECN with flow scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3480", "title": "H100 Cluster Congestion Control and Network Optimization for Distributed ML", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use ECN, PFC, DCQCN-style control, and adaptive scheduling to fix tail-latency spikes in H100 collectives?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 3}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3481", "title": "Optimizing All-Reduce for LLM Training on AMD MI300X Cluster with Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you identify and resolve MI300X all-reduce incast congestion when standard ECN and PFC are insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3482", "title": "NVIDIA A100 Inference Server Cold Start Analysis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the FP32 and FP16 cold-start model-loading times from the 10 GB/s NFS onto the GPU, and what is the bottleneck?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3483", "title": "Scaling LLM Inference on NVIDIA H100 for Real-time Applications", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design H100 LLM inference serving for 1000 QPS and 50 ms p99, including loading, batching, autoscaling, and cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3484", "title": "Scaling High-Throughput LLM Inference on Google TPU v5e", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design TPU v5e LLM inference serving to handle unpredictable traffic, cold starts, model loading, batching, and autoscaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3485", "title": "MLOps Challenges with LLM Deployment on AMD MI300X", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might MI300X LLM inference show staging-only latency spikes despite strong hardware, and how would you diagnose and fix the bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3486", "title": "A100 Large Model Deployment Throughput", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the theoretical maximum inferences per second on one GPU if each inference moves 15 GB through a 2.0 TB/s HBM link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3487", "title": "Real-time LLM Deployment on Google TPU v5e with MLOps", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an end-to-end MLOps pipeline to ensure TPU v5e hardware constraints are validated automatically and training-serving consistency is maintained?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3488", "title": "Optimizing ML Workloads on Kubernetes with Google TPU v5e", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Kubernetes device plugins, node affinity, and scheduling to run TPU v5e training efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3489", "title": "Kubernetes 
Orchestration for High-Performance ML Training with NVIDIA A100", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the Kubernetes configuration, including GPU device plugins, node affinity, and job scheduling policies, to maximize utilization and minimize turnaround times?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3490", "title": "Kubernetes Pod Placement and Resource Allocation for Large Language Model Training on NVIDIA H100s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you configure Kubernetes pods, GPU resources, and node affinity for an 8-H100 distributed LLM training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3491", "title": "Diagnosing Underutilized Google TPU v5e in Kubernetes ML Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely root cause of the low TPU utilization and the `OOM` errors, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3492", "title": "Kubernetes Orchestration for LLM Training with AMD MI300X", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build a Kubernetes training platform for MI300X LLM workloads with ROCm devices, topology-aware affinity, fair scheduling, and low fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3493", "title": "Optimizing Distributed ML Training on Kubernetes with A100 GPUs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule a 16-A100 distributed training job onto only A100-80GB nodes using Kubernetes device plugins, node affinity, and batch scheduling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3494", "title": "Optimizing MoE Training on Kubernetes with Google TPU v5e", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose TPU v5e underutilization and HBM issues for a distributed MoE model, and optimize the Kubernetes deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3495", "title": "Optimizing LLM Training on Kubernetes 
with AMD MI300X", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the MI300X LLM training job below 50% GPU utilization, and what concrete Kubernetes and data-pipeline fixes would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3496", "title": "Optimizing Distributed ML Training on Kubernetes with NVIDIA A100s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose low GPU utilization and implement Kubernetes/NVIDIA scheduling strategies for topology-aware distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3497", "title": "A100 Model Deployment: Canary Release with Latency Budget", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many NVIDIA A100 GPUs would you need to provision for the canary cluster to handle its allocated traffic while respecting the performance improvements and latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3498", "title": "Diagnosing Latency Spikes and OOM During Canary Rollout on NVIDIA H100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically diagnose the root cause of these seemingly contradictory symptoms (low utilization and OOM) and stabilize the system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3499", "title": "Progressive Rollout of a Large Language Model on Google TPU v5e", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a progressive rollout strategy—incorporating shadow, canary, traffic splitting, and automated rollbacks—optimized for TPU v5e constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3500", "title": "Progressive Rollout of a Large Language Model on AMD MI300X", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you progressively roll out the 150 GB LLM on MI300X while monitoring performance and enabling rapid rollback?", "chain_ids": ["cloud-chain-auto-secondary-016-08"], "chain_positions": {"cloud-chain-auto-secondary-016-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-3501", "title": "Progressive Rollout of a Large Language Model on AMD MI300X Cluster", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What progressive rollout strategy would you use to deploy the larger LLM with high availability and fast rollback?", "chain_ids": ["cloud-chain-auto-secondary-016-08"], "chain_positions": {"cloud-chain-auto-secondary-016-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3502", "title": "Optimizing Multi-Model RAG Latency on NVIDIA A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the A100 RAG deployment to keep end-to-end latency under 500 ms while routing, batching, and minimizing inter-model transfer overhead?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 5}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3503", "title": "Optimizing Checkpointing for Large Scale LLM Training on Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose checkpoint frequency, contents, and method for a weeks-long 1000-GPU LLM training run with frequent failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3504", "title": "TPU-Powered Recommendation System: Diagnosing Data & Concept Drift", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the recommendation quality drop on TPU v5e, distinguishing data drift, concept drift, and training-serving skew at scale?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3505", "title": "Real-time ML Output Drift Detection on AMD MI300X", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build real-time drift detection for 1024-D FP16 embeddings on MI300X while minimizing serving overhead?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3506", "title": "Real-time Data Drift Detection for Transformer Models on NVIDIA A100 Architectures", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement real-time PSI or KL drift detection for both serving architectures, and what are the trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-017-12"], "chain_positions": {"cloud-chain-auto-secondary-017-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3507", "title": "Real-time LLM Data Drift Detection on NVIDIA H100", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you size, implement, and integrate real-time input-drift detection for an LLM with minimal latency impact?", "chain_ids": ["cloud-chain-auto-secondary-017-12"], "chain_positions": {"cloud-chain-auto-secondary-017-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3508", "title": "Real-time Recommendation System Drift on AMD MI300X", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate data drift, concept drift, and training-serving skew in the recommendation engine?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3509", "title": "Google TPU v5e Inference Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you gracefully degrade a TPU v5e LLM service running at 95% utilization to reduce latency and inference failures?", "chain_ids": ["cloud-chain-auto-secondary-015-17"], "chain_positions": {"cloud-chain-auto-secondary-015-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3510", "title": "Graceful Degradation for LLM Inference on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a graceful degradation ladder for MI300X fraud detection, including fallbacks, fail-safe/fail-operational modes, and QoS shedding?", "chain_ids": ["cloud-chain-auto-secondary-015-18"], "chain_positions": {"cloud-chain-auto-secondary-015-18": 0}, "chain_tiers": 
{"cloud-chain-auto-secondary-015-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3511", "title": "Graceful Degradation Anomaly on TPU v5e", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely explains the 15% latency increase and 2% relevance drop despite green health checks, and how would you diagnose it?", "chain_ids": ["cloud-chain-auto-secondary-015-17"], "chain_positions": {"cloud-chain-auto-secondary-015-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3512", "title": "Graceful Degradation for Large Language Models on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a degradation ladder and model fallbacks to maintain LLM service functionality under extreme stress on MI300X accelerators?", "chain_ids": ["cloud-chain-auto-secondary-015-18"], "chain_positions": {"cloud-chain-auto-secondary-015-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3513", "title": "Graceful Degradation on NVIDIA A100: Adapting to Load Spikes", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for an overloaded GPU fraud model using fallbacks, fail-operational modes, and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3514", "title": "Graceful Degradation for Real-time ML Inference on NVIDIA H100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you keep the real-time inference service fail-operational under reduced capacity using model fallbacks and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3516", "title": "Graceful Degradation for GenAI on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you gracefully degrade the sparse MoE service under latency spikes or HBM errors while preserving reliability and resource efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3517", "title": "Identifying Model Extraction Risks on NVIDIA A100 Deployments", "topic": "adversarial-robustness", "competency_area": "reliability", "track": 
"cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Describe the concept of a model extraction attack in this context and explain how you would protect the A100-hosted API.", "chain_ids": ["cloud-chain-auto-secondary-015-22"], "chain_positions": {"cloud-chain-auto-secondary-015-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3518", "title": "Designing a Robust and Reliable ML System against Adversarial Attacks on Google TPU v5e", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the TPU v5e real-time ML system to remain reliable against poisoning and evasion attacks?", "chain_ids": ["cloud-chain-auto-secondary-015-23"], "chain_positions": {"cloud-chain-auto-secondary-015-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3519", "title": "Diagnosing Covert Adversarial Perturbations on LLMs within an NVIDIA A100 Cloud Fleet", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of these subtle output anomalies and mitigate a potential adversarial attack?", "chain_ids": ["cloud-chain-auto-secondary-015-22"], "chain_positions": {"cloud-chain-auto-secondary-015-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3520", "title": "Autonomous Systems: Adversarial Robustness and Reliability", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the image recognition service to resist adversarial inputs and prompt injection while preserving low-latency throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3521", "title": "TPU v5e Adversarial Defense Throughput Impact for Real-time Fraud Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What overhead does a 1 GFLOP sanitization step add to a 100 GFLOP TPU inference, and how would you manage its impact?", "chain_ids": ["cloud-chain-auto-secondary-015-23"], "chain_positions": {"cloud-chain-auto-secondary-015-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3522", "title": "Adversarial Robustness on AMD MI300X in Cloud Deployments", "topic": "adversarial-robustness", "competency_area": 
"reliability", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design edge detection and MI300X-side defenses against adversarial perturbations, prompt injection, and side-channel attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3523", "title": "Real-time Observability for H100-powered ML Inference Service", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build telemetry, alerts, dashboards, MTBF/MTTR tracking, and straggler detection for the H100 inference cluster?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3524", "title": "High-Performance ML Inference Monitoring: H100 Architecture Comparison", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor these two H100 architectures, including GPU metrics, stragglers, tail latency, and cross-GPU communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3525", "title": "Diagnosing Stragglers in Real-time LLM Inference on AMD MI300X Cluster", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the p99 latency spikes and quantify the reliability improvement from your fix?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3526", "title": "A100 Inference Performance Degradation: Monitoring & Anomaly Detection", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a monitoring strategy to detect, diagnose, and minimize MTTR for these straggler requests?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3527", "title": "NVIDIA A100 Data Ingestion Bottleneck for Large Language Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is GPU utilization only 
40-50%, and which ETL/data-loading optimizations would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3528", "title": "NVIDIA H100 and Feature Freshness in Online Feature Stores", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is point-in-time correctness, and how would you ensure it in the online feature store for this high-performance fraud model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3529", "title": "Optimizing Feature Freshness for Real-time Inference on Google TPU v5e", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you keep features fresh within 500 ms and point-in-time correct across online and offline stores for the TPU v5e recommender?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3530", "title": "Real-time Feature Freshness and Throughput for Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the recommendation feature pipeline to keep features fresh within 500 ms and point-in-time correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3531", "title": "Diagnosing Stale Features in Real-time H100 Model Deployment", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of these stale features impacting the real-time model given GPU under-utilization and a healthy feature store?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3532", "title": "Evaluating Feature Store Architectures for Real-time Fraud Detection on NVIDIA GPUs", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What feature-store architecture would you recommend for sub-10 ms fraud inference, and how do A and B trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3533", "title": "Real-time Feature Serving with H100-powered Embeddings", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the embedding generation and feature serving architecture to guarantee freshness, point-in-time correctness, and scalability to millions of QPS?", "validated": true, "math_verified": true, "scenario": "",
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3534", "title": "Real-time Fraud Detection with AMD MI300X: Feature Store & Point-in-Time Correctness", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design online and offline feature stores to feed the fraud model with millisecond-fresh, point-in-time-correct features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3535", "title": "Real-time Data Quality & Validation for H100 ML Systems", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality gate before the training cluster to stop schema drift, invalid ranges, and anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3536", "title": "Designing a High-Throughput Data Quality Gate for AMD MI300X", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What architecture and throughput calculation would ensure a 2 TB/hour data quality gate does not bottleneck the inference stream?", "chain_ids": ["cloud-chain-auto-secondary-015-28"], "chain_positions": {"cloud-chain-auto-secondary-015-28": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3537", "title": "Real-time Data Quality & Validation for A100-Accelerated Anomaly Detection", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a low-latency ingestion-time data quality gate so bad trading records never reach the A100 inference engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3538", "title": "Optimizing TPU v5e Utilization with Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose data-quality causes of TPU v5e throughput drops and quantify the gain after adding validation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3539", "title": "Ensuring Data Integrity for Petabyte-Scale LLM Training on AMD MI300X Clusters", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a continuous data quality framework for MI300X LLM training that handles schemas, contracts, gates, lineage, and 
anomalies?", "chain_ids": ["cloud-chain-auto-secondary-015-28"], "chain_positions": {"cloud-chain-auto-secondary-015-28": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-28": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3540", "title": "Optimizing Dataset Curation for Bias Mitigation on Google TPU v5e", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the TPU v5e's 16 GB HBM, BF16 compute, and 1.6 TB/s bandwidth shape active-learning selection and annotation to reduce dataset bias?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3541", "title": "Optimizing Active Learning Dataset Curation on NVIDIA H100s", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 10 GPUs, 100 annotators, and a 20% re-annotation rate, where is the bottleneck and how would you maximize labeled-image throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3542", "title": "TPU v5e Training Inefficiency: Diagnosing Data Curation Issues", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose whether active-learning selection, annotation quality, inter-annotator agreement, or bias is causing the training instability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3544", "title": "Optimizing Annotation Throughput on H100 for Large-Scale Dataset Curation", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Given a target of annotating 10 million images within 3 months, each requiring an average of 5 seconds of human labeling time, how would you evaluate centralized labeling versus distributed edge labeling while preserving IAA and limiting bias?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3545", "title": "Optimizing Active Learning for Large-Scale Model Training on Google TPU v5e", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you prioritize samples for annotation to maximize model performance gains while minimizing labeling costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3546", "title": "Real-time Anomaly Detection with AMD MI300X: Streaming Data Pipeline Analysis", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "How would you architect the stream-processing pipeline to meet a 50ms anomaly-alert SLA and avoid ingestion or feature bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3547", "title": "Real-time Anomaly Detection on High-Frequency Sensor Streams with NVIDIA H100", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the stream pipeline to process 10 GB/s telemetry and meet the 50 ms anomaly-detection SLA?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3548", "title": "Real-time Sensor Data Ingestion & ML Inference on AMD MI300X", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build the ingestion, feature-computation, and integration pipeline for 100,000 sensors emitting 1 KB every 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3549", "title": "Optimizing Real-time Inference on NVIDIA A100 for Streaming Data", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What batching or data-movement fix would address 50 ms P99 spikes and 30% GPU utilization in the 50k events/s anomaly stream?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3550", "title": "Real-Time Anomaly Detection on High-Throughput Sensor Data with Accelerators", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a resilient event-stream pipeline for 100M events/s with sub-100 ms anomaly alerts?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3551", "title": "Optimizing Data Storage for NVIDIA A100 Training: Parquet vs. 
TFRecord", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format strategy—Parquet, TFRecord, or hybrid—would maximize training throughput for this recommendation dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3552", "title": "Optimizing Data Ingestion for H100-Powered Foundation Model Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign JSON-based storage and ingestion so H100s no longer spend 70% of training time waiting for data?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3553", "title": "Diagnosing HBM Underutilization in Sparse Feature Training on MI300X", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the training pipeline I/O-bound with zstd Parquet on S3, and what storage format strategy would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3554", "title": "H100 Data Ingestion: Optimizing Storage for Large-Scale ML", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign storage and ingestion for the 10 PB image-feature dataset to maximize H100 utilization?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3555", "title": "Optimizing Large-Scale ML Data Storage for AMD MI300X", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage formats, compression, indexing, and tiering would you use to feed PB-scale multimodal data to MI300X training efficiently?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3556", "title": "Optimizing Data Loading for Large-Scale A100 Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the S3 TFRecord data-loading bottleneck that is starving the training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3557", 
"title": "Optimizing Large-Scale Model Training Data I/O for NVIDIA H100 Clusters", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design formats, compression, and storage tiers to keep H100s fed during petabyte-scale multimodal training?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3558", "title": "Optimizing Large-Scale Foundation Model Training on TPU v5e: Data Efficiency & Compute Constraints", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the naive 10 TB training approach inefficient on TPU v5e, and what data-efficiency strategies would you use?", "chain_ids": ["cloud-chain-auto-secondary-015-38"], "chain_positions": {"cloud-chain-auto-secondary-015-38": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-38": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3559", "title": "Data-Efficient LLM Training Design on AMD MI300X", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data selection and processing strategy would you use on MI300X to raise ICR and avoid model collapse for multi-petabyte training?", "chain_ids": ["cloud-chain-auto-secondary-015-39"], "chain_positions": {"cloud-chain-auto-secondary-015-39": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3560", "title": "Diagnosing Data Efficiency and Model Collapse on NVIDIA H100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and address the H100 data-efficiency issues causing diminishing returns, forgetting, and poor ICR?", "chain_ids": ["cloud-chain-auto-secondary-015-40"], "chain_positions": {"cloud-chain-auto-secondary-015-40": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3561", "title": "Optimizing Large-Scale Foundation Model Training with Data Efficiency on Google TPU v5e", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data selection strategy would train the petabyte-scale model on the accelerators within 100 TPU-days while maximizing Information-Compute Ratio (ICR) and avoiding collapse?", "chain_ids": ["cloud-chain-auto-secondary-015-38"], "chain_positions": 
{"cloud-chain-auto-secondary-015-38": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-38": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3562", "title": "Optimizing Data Pruning for Large Language Model Training on AMD MI300X", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach identifying and implementing an effective data pruning strategy, considering the unique characteristics of the hardware?", "chain_ids": ["cloud-chain-auto-secondary-015-39"], "chain_positions": {"cloud-chain-auto-secondary-015-39": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-39": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3563", "title": "Optimizing LLM Training on A100s: Coreset vs. Synthetic Data for the Data Wall Problem", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you evaluate coreset selection versus synthetic data generation on A100s, and would you choose one or a hybrid strategy?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3564", "title": "Optimizing LLM Training Data Efficiency on NVIDIA H100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage data efficiency techniques to address I/O bottlenecks and optimize the Information-Compute Ratio (ICR)?", "chain_ids": ["cloud-chain-auto-secondary-015-40"], "chain_positions": {"cloud-chain-auto-secondary-015-40": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3565", "title": "Optimizing Data Efficiency on Google TPU v5e: Addressing Model Collapse in Large-Scale Foundation Models", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What quantifiable data-efficiency plan would you use to improve the Information-Compute Ratio (ICR) and reduce model-collapse risk at hundreds-of-TB scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3566", "title": "Federated Learning on TPU v5e: Communication and Non-IID Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural choices would you mitigate these challenges, leveraging the TPU v5e's specs?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3567", "title": "Federated Averaging Optimization on NVIDIA A100 for Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the primary bottleneck (communication vs. computation on A100), propose an optimization strategy, and quantify its expected impact?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 2}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3568", "title": "Optimizing Federated Learning on NVIDIA H100 for Non-IID Edge Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the federated learning protocol to handle communication limits, non-IID clients, cross-device privacy, and high-performance server-side aggregation?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 3}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3569", "title": "DP-SGD and Privacy Budget on AMD MI300X", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the epsilon privacy budget affect DP-SGD noise and model accuracy on the MI300X?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3570", "title": "Optimizing DP-SGD on NVIDIA A100 for Federated Learning Privacy-Utility Tradeoff", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calibrate clipping, noise, and epsilon accounting for DP-SGD with epsilon=8 on A100s while preserving medical model utility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3571", "title": "Designing a DP-SGD System with NVIDIA H100 for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the federated DP-SGD architecture on H100s, including epsilon accounting, noise calibration, batch sizing, and privacy monitoring?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3572", "title": "Optimizing DP-SGD on Google TPU v5e with Privacy Budget Constraints", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calibrate the noise scale (σ) per step to meet the privacy budget while considering hardware constraints?", "chain_ids": ["cloud-chain-auto-secondary-009-14"], "chain_positions": {"cloud-chain-auto-secondary-009-14": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3573", "title": "DP-SGD Misconfiguration on AMD MI300X: Utility Drop & Rapid Privacy Budget Consumption Diagnosis", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root causes would you investigate for a 15% utility drop and rapid epsilon consumption in DP-SGD?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3574", "title": "DP-SGD Scaling on NVIDIA H100: Balancing Privacy and Performance", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does the DP-SGD noise scale and per-sample gradient computation impact utility, throughput, and resource utilization on the H100?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3575", "title": "DP-SGD Model Evaluation on Google TPU v5e: Optimizing Privacy-Utility Tradeoffs for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate client-side versus server-side DP-SGD on TPU v5e for privacy budget, utility, noise calibration, and throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-14"], "chain_positions": {"cloud-chain-auto-secondary-009-14": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3576", "title": "DP-SGD Deployment on AMD MI300X: Budgeting Epsilon and Performance Impact", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you configure DP-SGD for the 70B LLM to meet epsilon=8, 
delta=1e-5 over 10 epochs while preserving throughput and quality?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3577", "title": "Optimizing DP-SGD on NVIDIA H100 for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and calibrate DP-SGD across H100s to meet epsilon=8.0 while preserving model utility and throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3578", "title": "Fairness Metric Definitions on NVIDIA A100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you define and differentiate between 'demographic parity' and 'equalized odds' in this context?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3579", "title": "Fairness Evaluation and Root Cause Analysis on Large Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the generative model's toxic output disparities and use the H100 cluster to analyze and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3580", "title": "Architecting a Real-time Fairness Evaluation System on Google TPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect real-time fairness monitoring on TPU v5e for demographic parity, equalized odds, and intersectional subgroups?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3581", "title": "Diagnosing Bias in a Large-Scale Model Deployed on NVIDIA A100 for Demographic Parity Failures", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the production demographic-parity gap in the loan approval model 
running on the A100?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3582", "title": "Fairness Evaluation of Large-Scale Models on Google TPU v5e", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you sample and process data on TPU v5e to compute subgroup TPR/FPR for equalized odds without degrading production SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3583", "title": "Fairness-Aware LLM Evaluation on AMD MI300X", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare sequential versus distributed fairness evaluation on MI300X while preserving deep intersectional subgroup analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3584", "title": "Optimizing Fairness-Aware Data Pipelines on NVIDIA H100 for LLM Content Moderation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and optimize the data pipeline to improve subgroup fairness evaluation and latency on the H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3585", "title": "Environmental Impact Disclosure for LLM on NVIDIA H100", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What environmental footprint disclosures and energy or carbon guardrails should the H100 model card include?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3586", "title": "TPU v5e Deployment and Responsible AI Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How could TPU v5e hardware behavior contribute to subtle demographic bias and affect Responsible AI guardrails?", "chain_ids": ["cloud-chain-auto-secondary-015-35"], "chain_positions": {"cloud-chain-auto-secondary-015-35": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3587", "title": "Guardrail Latency and Compute Budget for Responsible AI", 
"topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the minimum average FP16 TFLOPS rate the GPU must sustain while executing the guardrail model to meet the remaining latency budget.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3588", "title": "Diagnosing Emergent Bias in LLM on NVIDIA H100 Cluster", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the low-frequency bias using model cards, impact assessments, red-teaming, guardrails, and H100 telemetry?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3589", "title": "Designing a Responsible AI Governance Framework for High-Volume Credit Scoring on Google TPU v5e", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a Responsible AI governance service for continuous impact assessments, red-teaming, and dynamic model cards?", "chain_ids": ["cloud-chain-auto-secondary-015-35"], "chain_positions": {"cloud-chain-auto-secondary-015-35": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3590", "title": "Responsible LLM Deployment on AMD MI300X with Bias Mitigation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy responsibly on MI300X under launch pressure, including guardrails, red-teaming, model cards, and accountability?", "chain_ids": ["cloud-chain-auto-secondary-015-36"], "chain_positions": {"cloud-chain-auto-secondary-015-36": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3591", "title": "Ethical LLM Deployment on H100s: Performance vs. 
Responsible AI", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you integrate guardrails, model cards, and impact assessments while meeting 10,000 RPS and 200 ms p99?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3592", "title": "Mitigating LLM Bias on AMD MI300X: A Responsible AI Framework", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Responsible AI governance framework for the biased 175B financial LLM?", "chain_ids": ["cloud-chain-auto-secondary-015-36"], "chain_positions": {"cloud-chain-auto-secondary-015-36": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3593", "title": "Cost-Performance Diagnosis: LLM Training on Google TPU v5e", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose TPU v5e underutilization and reduce LLM training TCO without sacrificing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3594", "title": "Optimizing LLM Deployment TCO on NVIDIA A100: Spot vs. 
Reserved Instance Strategy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build a 2-year A100 cloud TCO analysis and choose between on-demand, spot, reserved, or hybrid deployment?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 2}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3595", "title": "NVIDIA H100 Deployment Strategy: Optimizing TCO for LLM Training", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What H100 cloud procurement and workload strategy would reduce TCO for 60% predictable training and 150% burst spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3596", "title": "Little's Law Throughput Bound on H100 Inference Cluster", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using Little's Law, what steady-state throughput corresponds to 480 requests in flight and 120 ms mean service time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3597", "title": "M/M/1 Queue Utilization and Mean Wait on A100 Serving Node", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the mean queue wait at λ=90 req/s and µ=100 req/s, and why is 90% utilization dangerous for tail latency?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 1}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3598", "title": "Tail Latency Amplification in Multi-Stage H100 Pipeline", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Explain tail latency amplification and why does a 4-stage pipeline with 50ms per-stage p99 show ~220ms end-to-end p99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3599", "title": "M/M/c Queue Design for TPU v5e Serving Pool", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many TPU servers (c) do you need, and what utilization target should you set to maintain a p99 latency under 25ms?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 1}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3600", "title": "Head-of-Line Blocking in LLM Decode Queue on MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What head-of-line blocking is caused by the 80% short and 20% long FIFO mix, and how would you fix it?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 1}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3601", "title": "Queuing Model for Prefill-Decode Disaggregation on H100", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using queuing theory, how would you split 26 GPUs between prefill and decode workers to maintain P99 latency SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3602", "title": "Token Bucket vs Leaky Bucket Rate Limiting for H100 Serving", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do token bucket and leaky bucket rate limiting compare for 1000 req/s 100 ms bursts on a 600 req/s cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3603", "title": "Work-Conserving Scheduler Analysis for Mixed Priority on MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a preemptive priority queue for premium and standard traffic, and can premium meet p99<100 ms?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 2}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3604", "title": "Queueing Theory Mastery: SLA-Driven Capacity Planning for H100 Fleet", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What G/G/c capacity plan would meet p99 < 200ms and p999 < 1s for 50,000 req/s with a service-time CV=2.0?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 3}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3605", "title": "PUE Calculation for H100 Datacenter Power Budget", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the PUE for 1000 H100s 
drawing 700 W each in a 1.4 MW facility, and what does it mean for useful compute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3606", "title": "Cooling Bottleneck Analysis for Dense H100 GPU Rack", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the rack power or cooling bottleneck for 8 HGX H100 nodes, and what rack density is actually usable?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 2}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3607", "title": "Stranded Capacity Identification in Mixed GPU Datacenter", "topic": "datacenter-efficiency", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is capacity stranded in the 200-rack datacenter, and how much H100 compute is unusable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3608", "title": "Power Distribution Unit Redundancy for H100 Training Cluster", "topic": "datacenter-efficiency", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 512-GPU 14-day training cluster, should the PDU architecture use 2N or N+1 redundancy, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3609", "title": "WUE and Water Efficiency Trade-off for Evaporative Cooling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With WUE=2.5 L/kWh and a 50,000 L/day cap, what maximum IT load is allowed and what constraint dominates?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 2}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3610", "title": "Carbon-Aware Scheduling for H100 Training Jobs", "topic": "sustainability-carbon-accounting", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a carbon-aware job scheduler across Virginia and Oregon to cut CO2 emissions by 50% given their spare capacities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3611", "title": "Power Capping Impact on Training Throughput for MI300X", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can a 750 W to 600 W MI300X power cap 
cause a non-linear throughput loss, and what trade-off does it imply?", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 1}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3612", "title": "Rack-Level Power Provisioning for TPU v5e Pod", "topic": "datacenter-efficiency", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you provision rack, row, and facility power with N+1 redundancy for the 256-chip TPU v5e pod?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3613", "title": "DCiE and Total Facility Efficiency for Multi-Tenant GPU Cloud", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does DCiE=83% mean at PUE=1.2, how is it related to PUE, and is it competitive for a GPU cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3614", "title": "Datacenter Efficiency Mastery: Full-Stack TCO Optimization for H100 Training Facility", "topic": "sustainability-carbon-accounting", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What cooling, power, and location architecture would minimize 5-year TCO for a 10 MW H100 training datacenter under the efficiency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3615", "title": "Ring-AllReduce Bandwidth Calculation for H100 NVLink Cluster", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "For 7B FP32 gradients on 8 H100s over NVLink, how much data does ring-AllReduce transmit per GPU and what is the minimum sync time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3616", "title": "Ring vs Tree AllReduce Trade-off on Multi-Node H100 Cluster", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For synchronizing 52GB gradients on a 64-GPU cluster, does ring-AllReduce or tree-AllReduce perform better, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3617", "title": "Gradient Compression Accuracy-Bandwidth Trade-off on H100 Training", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which gradient compression strategy would you use to reduce the 45 s synchronization 
bottleneck, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3618", "title": "Gradient Synchronization Overlap with Computation on H100", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How does gradient bucketing overlap the 400 ms AllReduce with the 800 ms backward pass, and what speedup is possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3619", "title": "AllReduce Algorithm Selection for Heterogeneous Bandwidth Topology", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hierarchical AllReduce strategy should the 256-GPU cluster use for 40 GB gradients across NVLink and InfiniBand tiers?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 2}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3620", "title": "Gradient Staleness in Asynchronous SGD on A100 Cluster", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does τ=8 step gradient staleness in asynchronous SGD affect convergence, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3621", "title": "Reduce-Scatter and AllGather Decomposition on H100 NVLink", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For ZeRO Stage 2 on 8 H100s with a 16 GB gradient buffer, what are the Reduce-Scatter and AllGather volumes versus standard AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3622", "title": "Gradient Checkpointing Impact on Synchronization Frequency", "topic": "gradient-synchronization", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does gradient checkpointing change the compute-to-communication ratio and AllReduce overlap efficiency for the 30B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3623", "title": "AllReduce Topology Diagnosis: Ring vs Butterfly on TPU v5e Pod", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Diagnose why the measured TPU v5e AllReduce time is 2.1s when ring-AllReduce analysis predicts 800ms, and explain the interconnect 
property responsible.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3624", "title": "NCCL AllReduce Tuning for H100 Multi-Rail InfiniBand", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you tune NCCL_ALGO, NCCL_PROTO, and message thresholds to improve 128-GPU AllReduce bandwidth from 85% to 95%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3625", "title": "Gradient Synchronization Fluency: AllReduce Taxonomy", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How do AllReduce, AllGather, Reduce-Scatter, Broadcast, and Reduce differ in communication volume per rank and training use case?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3626", "title": "Gradient Synchronization Mastery: Pipeline Parallelism + AllReduce Co-Design", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you schedule tensor, pipeline, and data-parallel communication for the 1024-GPU 3D-parallel 175B training system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3627", "title": "RDMA Write vs Send Semantics on InfiniBand for H100 Parameter Server", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Should the parameter server use RDMA Write or RDMA Send for gradient pushes, and what are the key differences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3628", "title": "RoCEv2 vs InfiniBand Latency and Congestion on H100 Training Cluster", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the new training cluster use native InfiniBand or RoCEv2 over 200GbE, and how do they compare for ML training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3629", "title": "Zero-Copy RDMA Registration and Memory Pinning for GPU Direct", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might GPUDirect RDMA show 40% higher latency than CPU-path RDMA, and how would you fix the memory registration issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3630", "title": "Kernel Bypass 
Networking Overhead Analysis for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 100 MB gradient chunks over 25 GB/s links, when does RDMA's 3 µs latency matter versus TCP's 40 µs latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3631", "title": "InfiniBand Adaptive Routing for Hotspot Avoidance in H100 Fat-Tree", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use adaptive routing to eliminate AllReduce hotspots in the 512-H100 fat-tree InfiniBand cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3632", "title": "RDMA QP (Queue Pair) Scalability Limits on Large H100 Clusters", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does one RDMA Queue Pair per peer fail to scale on 1024 H100s, and what connection strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3633", "title": "GPUDirect Storage and RDMA for Checkpoint Loading on H100", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would GPUDirect Storage (GDS) meaningfully reduce 70B checkpoint save and restore times to NFS, and what improvement should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3634", "title": "RDMA Transport Fluency: IB Verbs API Key Operations", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you implement basic InfiniBand Verbs message passing, and what does zero-copy mean in that RDMA path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3635", "title": "RDMA Transport over Lossy Networks: RoCEv2 Congestion Collapse", "topic": "rdma-transport", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the RoCEv2 PFC storms during AllReduce, and how would you configure the network to prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3636", "title": "InfiniBand vs Ethernet Cost-Performance Analysis for H100 Scale-Out", "topic": "rdma-transport", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which network should a 1024-H100 training 
cluster choose, HDR InfiniBand or 400GbE RoCEv2, given cost and performance trade-offs?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 2}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3637", "title": "RDMA Transport Mastery: End-to-End Network Design for 4096-H100 Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the network fabric to meet the 4096-GPU cluster's latency, all-reduce, fault-tolerance, and $50M budget requirements?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 3}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3638", "title": "InfiniBand Subnet Manager Failover for Training Cluster", "topic": "rdma-transport", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design InfiniBand Subnet Manager failover so a management-node crash does not kill the 7-day training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3639", "title": "RDMA Read vs Write Semantics for KV Cache Transfer in LLM Disaggregation", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Would you use RDMA Read or RDMA Write to transfer the 2GB KV cache between prefill and decode H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3640", "title": "P99 Latency Spike Diagnosis on H100 Inference Cluster", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and reduce the recommender service's P99 latency from 180ms to under the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3641", "title": "Hedged Requests for LLM Inference on A100 Fleet", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What hedge timeout and safeguards would you use to bring P99 below 3s without causing a load cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3642", "title": "Load Balancing Strategy to Minimize P999 on TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the BERT-large service route requests 
so the 0.1% long sequences stop driving P999 to 800ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3643", "title": "P99 vs P999 Tradeoffs in Batching Strategy", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you change the dynamic batching policy to meet P999 < 200ms without collapsing GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3644", "title": "Tail Latency SLO Decomposition Across Microservices", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you set per-stage latency SLOs so the 5-stage pipeline meets a true 100ms end-to-end P99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3645", "title": "Measuring and Setting Realistic P999 SLOs for Autoregressive Generation", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What SLA would you propose for 200-2000 token generations on the 70B A100 service, and how would you architect to meet it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3646", "title": "Flash Attention Tiling Strategy for H100 SRAM Constraints", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What Flash Attention tile sizes would you choose for seq_len=4096 and d_head=128, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3647", "title": "Flash Attention IO Complexity vs Standard Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What theoretical IO reduction should the team expect from FlashAttention-2 versus standard attention at seq_len=8192 on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3648", "title": "Flash Attention Backward Pass Memory Recomputation", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should you revert from Flash Attention to standard attention to reduce backward-pass FLOPs at seq_len=16384, and why?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 2}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3649", "title": "Multi-Query Attention Memory Savings vs Flash Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For batch-1 decode, should you use MHA with Flash Attention, MQA, or GQA to minimize latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3650", "title": "Flash Attention with Variable-Length Sequences and Padding Masks", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you recover Flash Attention’s expected speedup for padded variable-length batches at seq_len=2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3651", "title": "Flash Attention v2 vs v3 Block Size Selection on H100", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you adopt Flash Attention v3 over v2 on H100 for the 70B model, and for which workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3652", "title": "Flash Attention Numerical Stability with Long Sequences", "topic": "flash-attention", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you debug and fix NaNs from Flash Attention at seq_len=32768 on TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3653", "title": "Diagnosing Flash Attention Regression After Library Upgrade", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you investigate and fix the 25% throughput regression after upgrading flash-attn from 2.3 to 2.6?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3654", "title": "Kubernetes GPU Scheduling Fragmentation with H100 MIG", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 14-GPU pod fail to schedule despite 16 MIG instances, and how would you fix the Kubernetes configuration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3655", "title": "HPA Scaling Latency for GPU Inference Pods", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you redesign autoscaling so the A100 inference service handles a 3x 
traffic spike before the HPA's 4-minute reaction time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3656", "title": "Multi-GPU Pod Affinity for NVLink-Dependent Workloads", "topic": "container-orchestration", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule the 70B training pods so 8-GPU jobs stay within one node and avoid the 18x all-reduce slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3657", "title": "Container Image Size and Model Loading Latency on GPU Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce the 13B model pod cold start from 8 minutes to under 2 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3658", "title": "Resource Quota and GPU Memory Oversubscription in Kubernetes", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you prevent 70B MI300X workloads from OOM-killing other tenants when Kubernetes only tracks GPU count?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 2}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3659", "title": "Kubernetes Operator Design for Distributed Training Jobs", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you orchestrate the 64-GPU PyTorchJob so a single node preemption does not restart 3 hours of training from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3660", "title": "Custom Kubernetes Scheduler for GPU Memory-Aware Placement", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a Kubernetes scheduler plugin that places inference pods based on free GPU memory instead of GPU count?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 3}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3661", "title": "Kubernetes Network Policy for Secure Multi-Tenant GPU Inference", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": 
"How would you enforce network isolation between enterprise customers' pods across Kubernetes namespaces?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3663", "title": "Megatron-LM Column vs Row Parallelism for MLP Layers", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should W1 and W2 in the TP=4 MLP block be split between column and row parallelism, and what communication is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3664", "title": "Tensor Parallelism Communication Bottleneck on Multi-Node Cluster", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you change the TP=16 training setup to remove cross-node tensor-parallel all-reduces and recover MFU?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 2}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3665", "title": "Tensor Parallelism for Attention Heads: Optimal Head Distribution", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you split Q and KV heads for Grouped-Query Attention (GQA) with 32 Q heads, 8 KV heads, and TP=4 on 4 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3666", "title": "Tensor Parallelism Correctness: Dropout and Random State Synchronization", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can TP=4 training with dropout diverge from the single-GPU baseline despite the same seed, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3667", "title": "TP+PP Combined Parallelism Strategy for 540B Model", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 3D parallelism strategy would you use to train the 540B model on 256 GPUs while targeting MFU > 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3668", "title": "Tensor Parallelism Activation Memory and Recomputation", "topic": "model-tensor-parallelism", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you fit batch_size=32 for the 30B TP=4 training run when 
activation memory is causing OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3669", "title": "Tensor Parallelism Embedding Layer Split and Vocabulary Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you tensor-parallelize the 128K-token embedding and LM head with TP=4, and what communication does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3670", "title": "Tensor Parallelism Scaling Efficiency Measurement", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the scaling efficiency from TP=4 to TP=8, and what bottleneck explains the 2100 tokens/sec result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3671", "title": "Pod Disruption Budget Design for Zero-Downtime Model Rollouts", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you configure rolling updates so the 12-replica A100 deployment avoids P99 latency spikes during model rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3672", "title": "MoE Expert Routing Overhead", "topic": "mixture-of-experts", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce the 64-expert MoE router overhead from 12% on a 4096-token batch without reducing the expert count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3673", "title": "Expert Load Imbalance Killing Throughput in MoE", "topic": "mixture-of-experts", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes one GPU to hit 100% utilization while the others idle, and how would you fix the load imbalance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3674", "title": "Designing MoE Expert Sharding for TPU v5e", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you shard 128 1B-parameter experts across 64 TPU v5e chips, and what all-to-all cost should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3675", "title": "MoE Auxiliary Loss Tuning for Stable Training", "topic": "mixture-of-experts", "competency_area": 
"optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is expert collapse occurring after 5k steps with $\\alpha=0.01$, and how would you fix the MoE load-balancing loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3676", "title": "MoE Inference Batching Strategy for Throughput", "topic": "mixture-of-experts", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Should you pad low-rate MoE batches to 64 tokens on MI300X, or use another approach to improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3677", "title": "MoE vs Dense Model Memory Tradeoff at Scale", "topic": "mixture-of-experts", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the memory and compute tradeoffs of the 140B top-2 MoE versus the 70B dense model, and when would you prefer the MoE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3678", "title": "ECMP Hashing Imbalance in All-Reduce Traffic", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the ECMP hashing imbalance, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3679", "title": "PFC Deadlock Prevention in RoCE Cluster", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the RoCEv2/PFC configuration to prevent 45-second PFC-induced training stalls after topology changes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3680", "title": "Buffer Bloat Causing Gradient Staleness", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does switch buffer bloat cause >5-step gradient staleness, and what would you change to reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3681", "title": "Incast Collapse During Gradient Aggregation", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What failure mode is slowing the 512-worker parameter-server training job, and how would you fix the 32:1 fan-in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3682", "title": "DCQCN Tuning for Large-Scale All-Reduce", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you tune DCQCN for the 400 GbE RoCEv2 cluster to stop the 90% to 40% throughput oscillation?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 2}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3683", "title": "Flow Completion Time vs Bandwidth Tradeoff in Gradient Sync", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do large all-reduce gradients and small parameter-server control messages need different congestion-control treatment, and how would you support both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3684", "title": "Multi-Tenant GPU Scheduling with SLO Guarantees", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a scheduling system to guarantee Tenant A's SLO while maximizing Tenant B's GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3685", "title": "Bin-Packing GPU Jobs to Minimize Fragmentation", "topic": "scheduling-resource-management", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the cost of the 18% GPU fragmentation, and what allocation strategy would reduce over-provisioned 8-GPU jobs?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 1}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3686", "title": "Preemption Overhead in Long-Running Training Jobs", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much overhead do the 4-hour preemptions and 18-minute NFS checkpoints add, and how would you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3687", "title": "Gang Scheduling for Distributed Training Efficiency", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is all-or-nothing gang scheduling necessary for the 30 concurrent 8-GPU jobs, and what scheduler behavior would fix utilization?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 
1}, "chain_tiers": {"cloud-chain-auto-021-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3688", "title": "SLO-Aware Autoscaling for Inference Under Bursty Load", "topic": "scheduling-resource-management", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you autoscale 70B LLaMA serving to survive 3x spikes lasting 2-5 minutes when H100 cold starts take 4 minutes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3689", "title": "Spot Instance Preemption Handling in Training Clusters", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a fault-tolerant checkpointing strategy for 32 A100 spot training so a 2-minute interruption notice loses under 10 minutes of work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3690", "title": "GPipe Bubble Overhead Calculation for 8-Stage Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 8 pipeline stages and 64 micro-batches, what is the pipeline bubble fraction and effective GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3691", "title": "Interleaved Pipeline Schedule to Reduce Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would an interleaved 1F1B schedule with virtual stages change the bubble fraction and memory use for the 8-stage, m=16 pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3692", "title": "Stage Imbalance in Heterogeneous Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much throughput is lost because Stage 4 is 35% slower, and how would you rebalance the 4-stage pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3693", "title": "Micro-batch Size Selection Under Memory Constraints", "topic": "pipeline-parallelism", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the maximum micro-batch size you can use with 8 micro-batches in flight?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 0}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3694", "title": "Combining Pipeline 
and Tensor Parallelism for 1T Parameter Model", "topic": "pipeline-parallelism", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What data, tensor, and pipeline parallelism factors would you choose for a 1T-parameter model on 1024 H100s under a 70GB HBM limit, and why?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 3}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3695", "title": "Online vs Offline Feature Store Consistency Tradeoffs", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 0.8% of recommendation requests serving >24h-stale batch features, and how would you detect and mitigate that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3696", "title": "Feature Serving Latency Budget for Two-Tower Models", "topic": "feature-store-management", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign feature retrieval to bring p99 latency from 18ms to under the 5ms budget for the 50ms two-tower SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3697", "title": "Feature Skew Between Training and Serving", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you confirm and fix the training-serving skew?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 2}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3698", "title": "Feature Store Freshness SLAs for Time-Sensitive Models", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What freshness SLA should each fraud feature tier have, and how would the feature store enforce those SLAs?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 0}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3699", "title": "Feature Store Versioning for Safe Model Rollouts", "topic": "feature-store-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you version features so the v2 recommendation model can roll out gradually while v1 continues serving safely?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3700", "title": "Speculative Decoding Draft Model Selection for 70B LLM", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What speedup should you expect from a 7B draft model for speculative decoding of the 70B target, and what acceptance rates make it worthwhile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3701", "title": "Priority-Based GPU Scheduling for Mixed Training and Inference", "topic": "scheduling-resource-management", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule inference and elastic training on the 256-GPU cluster to use overnight capacity without violating the p99 < 100ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3702", "title": "Roofline Ceiling Identification on H100 for Transformer MLP Blocks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 4096x4096x16384 BF16 GEMM on H100 compute- or memory-bound, and how large is the performance gap?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 3}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3703", "title": "Memory-Bound vs Compute-Bound Classification for Attention", "topic": "roofline-analysis", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 2048-token, 32-head QK^T attention operation compute- or memory-bound, and what should you use to improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3704", "title": "Flash Attention Arithmetic Intensity Analysis on H100", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How much memory traffic does FlashAttention-2 save versus materializing the attention matrix, and what bandwidth speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3705", "title": "Multi-Query vs Multi-Head Attention KV Cache Trade-off", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do MHA, MQA, and 8-group GQA compare in KV-cache memory and decode throughput for batch 32 at 4096 context on H100 80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3706", "title": "INT8 Quantization Calibration Strategy for LLM Inference on H100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the accuracy and hardware-efficiency tradeoffs of per-tensor versus per-channel INT8 quantization for the 13B LLaMA model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3707", "title": "FP8 vs INT8 Quantization Throughput on H100 for Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which FP8 formats should you use for forward and backward passes of the 7B transformer given gradients spanning 1e-7 to 1e3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3708", "title": "HBM3 vs L2 Cache Access Patterns for Transformer Weights on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Will the 402MB QKV weight matrix fit in L2 cache during batch-1 decode, and what per-token latency does HBM streaming imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3709", "title": "MI300X Unified Memory Architecture for Large Model Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a single MI300X serve the 70B FP16 model without tensor parallelism, and how does its decode throughput compare to 2x H100s with tensor parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3710", "title": "Continuous Batching vs Static Batching Throughput on H100 for LLM Serving", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much throughput improvement should continuous batching provide over static batching for 2048-token slots with 500-token average requests?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 3}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3711", "title": "Prefill Batch Sizing for TTFT Optimization on A100", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At what batch size does TTFT SLA become violated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3712", "title": "H100 TDP Management Under Sustained Training Load", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 8 H100s drawing 680W each in a 40kW rack, how much thermal headroom remains and can you add a 9th GPU?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 3}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3713", "title": "Power vs Performance Trade-off with GPU Frequency Scaling on A100", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If a GPU is power-capped from 400W to 300W, what frequency and throughput reduction follow after accounting for static power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3714", "title": "AllReduce Bandwidth Requirement for Ring All-Reduce on 8×H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does ring all-reduce take for 52GB FP32 gradients across 8 H100s on NVLink, and can it overlap with backward compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3715", "title": "ReduceScatter vs AllReduce for Pipeline Parallelism on H100 Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do ZeRO-1 and ZeRO-3 communication patterns compare within the 2-way DP groups of the 8-stage pipeline for a 70B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3716", "title": "AllToAll Communication for Expert Parallelism in MoE on TPU v5e", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What AllToAll volume and latency does top-2 MoE token dispatch create for batch=512, seq_len=2048, token_dim=1024 across 64 TPU chips?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3717", "title": "Speculative Decoding Acceptance Rate Impact on H100 Inference Throughput", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What throughput improvement should speculative decoding deliver with k=5 and a=0.8 versus 30 tok/s target-only decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3719", "title": "Structured Pruning 
50% of MLP Layers in 70B LLM — MI300X Impact", "topic": "pruning-sparsity", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What parameter reduction, memory savings, and decode throughput speedup result from pruning 30% of MLP neurons in the 70B model?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 2}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3720", "title": "Thermal Throttling During Long-Context Training on H100", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What throughput loss and recovery-time impact should you expect from throttling from 1.41GHz to 1.19GHz after the 5°C ambient rise?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3721", "title": "Compute-Optimal Scaling Laws Verification with Roofline on A100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For a 50B model trained on 1T tokens across 1024 A100s, what are the training time, memory footprint, and roofline bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 4}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3722", "title": "GPTQ vs AWQ Quantization Quality on H100 for 70B LLM", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Given similar inference speed, would you ship GPTQ or AWQ for Llama-3-70B 4-bit production, and what trade-offs besides PPL matter?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3723", "title": "Tensor Parallelism GEMM Partitioning on H100 for LLM Layer", "topic": "model-serving-infrastructure", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the per-layer communication cost of 8-way tensor-parallel QKV/output projections, and how efficient is it versus ideal scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3724", "title": "Memory Bandwidth Bottleneck Diagnosis for LLM Decode on MI300X", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the MI300X 
deployment reaching only 63% HBM bandwidth at batch_size=4, and how would you investigate the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3725", "title": "SLO Violation Root Cause Analysis for LLM Service on TPU v5e", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What likely causes P99 TTFT to jump to 2.1s while P50 stays 180ms on the TPU v5e Gemma-7B service, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3726", "title": "AllReduce Topology Comparison: Ring vs Tree on H100 IB Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare ring all-reduce, recursive halving-doubling, and a binary tree reduction: which minimizes wall-clock time for this payload?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 0}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3727", "title": "Batch Size Selection for Maximum MFU on A100 During Pre-Training", "topic": "batching-strategies", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does MFU peak at batch_size=128 and drop at 512, and which batch size is optimal for this training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3728", "title": "Carbon Footprint Estimation for LLM Training Run", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the total energy consumption, electricity cost, and carbon footprint for the 21.7-day, 1024-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3729", "title": "Benchmark MFU vs Achieved FLOPS for Attention vs MLP on H100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does attention achieve only 31% MFU versus 67% for MLP GEMMs, and what changes would improve overall MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3730", "title": "Prefill-Decode Disaggregation Architecture Specification for H100 Fleet", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many prefill vs decode GPUs would you use, how would you 
transfer KV cache, and how would you load-balance 1000 req/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3731", "title": "Multi-Level Memory Hierarchy Specification for 405B LLM Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tiered memory strategy or limits are required to serve 500 concurrent 8192-token users for Llama-3-405B FP8 on H100s?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 4}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3732", "title": "Roofline Model for Transformer Encoder vs Decoder on TPU v5e", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the arithmetic intensities and roofline bottlenecks for BERT-large encoder batches and GPT-2-large single-token decoding on TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3733", "title": "Gradient Checkpointing Break-Even Analysis for 70B Model on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does this 64x A100 tensor-parallel 70B training setup need activation checkpointing at batch=8, given the 6GB activation footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3734", "title": "Gradient Compression Impact on AllReduce Bandwidth on H100 Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What net throughput gain and accuracy risk should you expect from 99.9% Top-K gradient sparsification of 26GB gradients on 32 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3735", "title": "NCCL vs MPI AllReduce Performance for Mixed-Precision Training on H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How long should NCCL hierarchical AllReduce and tree-based MPI reduction take for 52GB gradients on the 64-H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3736", "title": "Long-Context Attention with Ring Attention on H100 Multi-Node", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the KV communication volume per ring step, and what is 
the total ring communication time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3737", "title": "Inference Serving Cost Optimization: Batching Strategy for RAG", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you batch the 10,000 daily embeddings versus 1,000 daily generations to minimize compute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3738", "title": "Kernel Fusion Impact on Memory Bandwidth for Layer Norm + GELU on H100", "topic": "model-serving-infrastructure", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What speedup should fusing LayerNorm and GELU deliver by reducing HBM traffic from 8GB to 4GB for the 2GB activation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3739", "title": "Disaggregated Serving with Chunked Prefill on A100 80GB", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you choose a chunked-prefill size to bound decode latency spikes from 2048-token prefills, and what TTFT trade-off results?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3740", "title": "Pipeline Bubble Analysis for 4-Stage Pipeline Parallelism on H100", "topic": "batching-strategies", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline bubble fraction and GPU utilization result from 4 stages and 4 micro-batches with 50ms forward plus 50ms backward per stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3741", "title": "Dynamic Sparse Attention Pattern Implementation on H100 for Code LLM", "topic": "attention-scaling", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "On an H100, will dynamic sparse attention with 288 attended positions per token at 8K context be faster than dense FlashAttention, and why?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3742", "title": "ZeRO-3 Communication Volume Analysis for 175B Model on 256 A100s", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much ZeRO-3 AllGather communication occurs per forward pass for GPT-3 175B on 256 
GPUs, and how does it compare to compute time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3743", "title": "H100 SM Occupancy Analysis for Small-Batch Inference Kernel", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What SM occupancy does this batch=1 GEMM achieve on H100, and why does launching only 32 thread blocks underutilize the GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3744", "title": "Activation Checkpointing Granularity Trade-off for 405B Model Training on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Analyze the memory-compute trade-off and determine whether you should use full or selective activation checkpointing.", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 3}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3745", "title": "PUE Reality Check: Cooling Overhead in a Hyperscale Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much total power does the facility draw, and how many megawatts are consumed purely by cooling, lighting, and power distribution overhead?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 1}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3746", "title": "Rack Power Density Limits with GPU-Dense Nodes", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many DGX nodes can you fit per rack, and what infrastructure changes are needed to reach 80 kW/rack density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3747", "title": "Carbon-Aware Scheduling: Shifting Training to Low-Carbon Hours", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much CO2 can you realistically save by shifting 30% of the workload to low-carbon hours, and what are the engineering tradeoffs?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 1}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3748", "title": "The Cooling Cliff: Air vs Liquid at GPU Density", "topic": "datacenter-efficiency", 
"competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the thermal challenge, and how do you solve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3749", "title": "Stranded Power: When GPU Utilization Tanks PUE", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the effective PUE when utilization is factored in, and how do you improve energy proportionality?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 3}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3750", "title": "TPU Pod Power Budgeting vs GPU Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which cluster is more power-efficient, and what architectural decisions drive the difference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3751", "title": "Water Usage Effectiveness: The Hidden Cost of Evaporative Cooling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much water does this datacenter consume daily, and why is WUE becoming a critical metric for AI infrastructure?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 1}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3752", "title": "Power Delivery: UPS Efficiency at Partial Load", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the annual energy wasted in UPS conversion losses, and how do you architect around this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3753", "title": "Compute Density vs Power: Planning a 100 MW AI Campus", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which cooling architecture would you recommend for a 100 MW campus with PUE ≤ 1.08 and 100 kW/rack density, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3754", "title": "Measuring True Energy per Token in Training", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", 
"phase": "both", "question": "What is the energy per token for the 512-H100, 2-week, 2T-token run, how does it compare to benchmarks, and how would you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3755", "title": "Thermal Throttling: When GPUs Self-Protect", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What causes the periodic step time increase to 3.5 seconds, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 0}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3756", "title": "GPipe Bubble Overhead: The Pipeline Efficiency Tax", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What pipeline bubble fraction results from partitioning 40 layers across 4 GPUs with 8 micro-batches under GPipe?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 1}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3757", "title": "1F1B vs GPipe: Memory Advantage of Interleaved Scheduling", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the peak activation memory per GPU under GPipe versus 1F1B, and why does 1F1B win on memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3758", "title": "Interleaved Pipeline Stages: Halving the Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does an interleaved pipeline schedule reduce the pipeline bubble compared to standard 1F1B, and what is the communication cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3759", "title": "Pipeline Parallelism vs Tensor Parallelism: When to Choose Which", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why should tensor parallelism stay within each MI300X node rather than crossing the 400 Gb/s InfiniBand links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3760", "title": "Micro-Batch Size Selection for Pipeline Efficiency", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", 
"question": "What is the minimum number of micro-batches to keep the PP=4 bubble under 10%, what micro-batch size follows, and what is the activation memory tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3761", "title": "Pipeline Stalls from Unbalanced Stage Partitioning", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the throughput impact of stage 0 being 1.8× slower, and how would you rebalance the pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3762", "title": "Pipeline Parallelism Activation Checkpointing Interaction", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With selective activation checkpointing, how much activation memory do you save per stage and what recomputation cost should you expect?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 2}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3763", "title": "Pipeline Bubble in Zero-Bubble Scheduling", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does zero-bubble pipeline scheduling work, what memory cost does it add, and does it truly achieve zero bubble?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 4}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3764", "title": "Pipeline Drain Latency in Inference Serving", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the time-to-first-token (TTFT) latency penalty from the pipeline, and how does continuous batching interact with pipeline parallelism for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3765", "title": "mmap for Zero-Copy Model Loading", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the cold-start times and memory implications for malloc+read versus mmap, and when does each loading strategy win?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3767", "title": "mmap vs Safetensors: Cold Start Optimization for Model Serving", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For sub-3-second cold starts across 50 safetensors 7B models, how do NVMe read, mmap, and host-RAM caching compare?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3769", "title": "Shared mmap for Multi-Tenant GPU Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a memory-efficient architecture using mmap to share the 26 GB model across 8 workers, and how would you manage the GPU memory?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3770", "title": "mmap and NUMA: The Hidden Latency Trap", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is cudaMemcpy to GPU 1 60% slower after mmap, and how would you design NUMA-aware multi-GPU model loading?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3771", "title": "Least-Connections Routing for Heterogeneous GPU Serving", "topic": "load-balancing", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design a load-balancing strategy that accounts for heterogeneous hardware capacities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3772", "title": "Consistent Hashing for Model-Aware Request Routing", "topic": "load-balancing", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does consistent hashing reduce model remapping when a GPU fails, and how would you mitigate the 3-second cold-start penalty?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3774", "title": "FlashAttention Tiling: Why SRAM Size Determines Tile Shape", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", 
"question": "Which FlashAttention tile sizes fit in 256 KB SRAM for N=4096, d=128, and why does tiling reduce HBM traffic from O(N²) to O(N)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3775", "title": "FlashAttention-2 vs Standard Attention: Wall-Clock Speedup", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using IO-complexity reduction, what FlashAttention-2 speedup should you expect over the 85 ms standard attention forward pass?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 0}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3776", "title": "PagedAttention: Virtual Memory for KV Cache", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does PagedAttention improve KV-cache utilization, and what throughput gain should you expect?", "chain_ids": ["cloud-chain-auto-014-04"], "chain_positions": {"cloud-chain-auto-014-04": 0}, "chain_tiers": {"cloud-chain-auto-014-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3777", "title": "Ring Attention for Million-Token Contexts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does Ring Attention distribute 1M-token attention across 8 MI300X GPUs, and what communication pattern does it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3778", "title": "Prefix Caching: Amortizing System Prompts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much GPU time is wasted recomputing the 2000-token system prompt, and what annual savings does prefix caching provide at $2 per GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3779", "title": "FlashAttention Backward Pass: The Recomputation Trade", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does FlashAttention handle the missing attention matrix during backward, and what extra compute cost does recomputation add?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 1}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3780", "title": "FlashAttention on TPU: XLA Attention vs Pallas Kernels", "topic": 
"flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What architectural differences matter for FlashAttention-style kernels on TPU v5e versus GPUs, and is a manual JAX Pallas kernel worth it?", "chain_ids": ["cloud-chain-auto-014-08"], "chain_positions": {"cloud-chain-auto-014-08": 1}, "chain_tiers": {"cloud-chain-auto-014-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3781", "title": "IO-Awareness: Roofline Model for Attention Kernels", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is standard attention memory-bound on an H100, what arithmetic intensity is needed to be compute-bound, and what is standard attention's AI?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 0}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3782", "title": "FlashDecoding: Parallelizing Attention During Inference", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does FlashDecoding parallelize a single-token decode over KV blocks for a 32K context, and what throughput improvement does it provide?", "chain_ids": ["cloud-chain-auto-014-07"], "chain_positions": {"cloud-chain-auto-014-07": 0}, "chain_tiers": {"cloud-chain-auto-014-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3783", "title": "Multi-Query vs Grouped-Query Attention Memory Savings", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache memory does 4096-token context require for MHA, GQA, and MQA in the 70B model, and how does this affect FlashAttention-2 tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3784", "title": "FlashAttention Memory Savings Enable Longer Training Contexts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With only 2 GiB left for activations, what max sequence length can standard attention support, and how does FlashAttention change it?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 0}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3785", "title": "Causal Masking in FlashAttention: Skipping Unnecessary Tiles", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": 
"optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does FlashAttention-2 exploit the causal lower-triangular mask when tiling, and what FLOP savings does it get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3786", "title": "FlashAttention on MI300X: HBM3 Bandwidth Advantage", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For compute-bound FlashAttention-2, how does the comparative performance shift, and which architecture benefits more relative to its standard attention baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3787", "title": "Sliding Window Attention with FlashAttention Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does W=4096 sliding-window attention interact with FlashAttention tiling for a 32K context, and what memory and compute savings result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3788", "title": "FlashAttention-3: Asynchronous Tiling on Hopper", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do TMA and warp specialization let FlashAttention-3 outperform FlashAttention-2, and what peak TFLOPS percentage does it reach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3789", "title": "Chunked Prefill: Balancing TTFT and Decode Throughput", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does chunked prefill with FlashAttention prevent 8192-token prefills from stalling decode, and what chunk size should you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3790", "title": "Online Softmax: The Numerical Foundation of FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How are the running max and running sum maintained across tiles, and why is this critical for correctness?", "chain_ids": ["cloud-chain-auto-014-06"], "chain_positions": {"cloud-chain-auto-014-06": 0}, "chain_tiers": {"cloud-chain-auto-014-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3791", "title": "FlashAttention Kernel Fusion Benefits", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": 
"published", "phase": "inference", "question": "How does FlashAttention kernel fusion reduce overhead versus five PyTorch attention kernels, and what two overhead types are eliminated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3792", "title": "Attention Sink Tokens and KV Cache Eviction", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the attention sink phenomenon, and what KV-cache eviction policy preserves quality for 128K-context streaming on an 80 GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3793", "title": "FlashAttention with Variable-Length Sequences in a Batch", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For lengths [128, 256, 512, 1024, 2048, 4096, 8192, 16384], what is the correct total unpadded attention work ΣN²?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3794", "title": "FlashAttention-2 Warp Partitioning Strategy", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What synchronization overhead does FlashAttention-2 eliminate by partitioning across the Q sequence dimension, and what is the correct latency-savings range?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 3}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3795", "title": "Attention with Linear Complexity: When FlashAttention Isn't Enough", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "On an MI300X with a 50 ms serving budget, at what sequence length does FlashAttention become attention-compute-bound versus memory-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3797", "title": "FlashAttention for Cross-Attention in Multimodal Models", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is this cross-attention operation memory-bound or compute-bound with standard attention, and how does FlashAttention-2 change this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3798", "title": "MoE Routing: Top-K Gating and Load Imbalance", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "With 256 tokens, 8 experts, and top-2 routing, how many tokens should each expert process uniformly, and what is the impact of the observed imbalance?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 0}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3799", "title": "MoE Memory Footprint: Why Sparse Models Need More VRAM", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does Mixtral 8x7B's VRAM requirement compare with a dense 13B model, and why can't you load only the active 13B parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3800", "title": "Expert Parallelism: All-to-All Communication Pattern", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the MoE All-to-All communication volume, and why is All-to-All the expert-parallel collective instead of AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3801", "title": "Capacity Factor Tuning: Quality vs Throughput", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you reduce the MoE capacity factor from 1.5 to 1.0, and what are the utilization-versus-token-drop tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3802", "title": "Auxiliary Load Balancing Loss: Mechanism and Coefficient", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What fraction f_i does each of the 5 nearly dormant experts receive if they uniformly share the remaining 20% of routed tokens?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 1}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3803", "title": "MoE Inference Latency: The Expert Loading Problem", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using 3.35 TB/s HBM3 bandwidth, what is the dense 13B INT4 weight-read lower bound for batch-1 decode latency?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 2}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3804", "title": "Expert Parallelism + Tensor 
Parallelism: Hybrid MoE Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you combine expert parallelism, tensor parallelism, and data parallelism for the 1.6T-parameter MoE on 256 MI300X GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3805", "title": "Token Dropping in MoE: Quality Impact at Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With an 8% token drop rate under top-2 routing, is the perplexity gap primarily due to token dropping, and what fraction of tokens lose an expert assignment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3806", "title": "MoE FLOP Efficiency: Why Sparse Models Train Faster", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many training FLOP does the 1.6T MoE use relative to the dense 175B model, and what FLOP savings should you expect?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 1}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3807", "title": "Sparse Gating: Softmax vs Sigmoid Router Design", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the implications of softmax versus sigmoid gating for load balancing, gradient flow, and expert specialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3808", "title": "Megablocks: Variable-Length Expert Computation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does block-sparse GEMM eliminate the capacity factor entirely, and what GPU hardware features does it exploit? 
With batch size 512, 8 experts, and capacity factor 1.5, what are the per-expert capacity and total padded token count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3809", "title": "Expert Choice Routing: Inverting the Selection", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Token Choice and Expert Choice routing compare on load balancing, token drops, and adaptive expert computation per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3810", "title": "MoE Batch Size Scaling: Sparsity Advantage Vanishes", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does MoE's latency scale worse with batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3811", "title": "Fine-Grained MoE: More Experts, Smaller Each", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At batch size 256, how many distinct experts should you expect to activate out of the 160?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3812", "title": "MoE on TPU: Pipelining All-to-All with ICI", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How long does a 4 MB All-to-All transfer take over a 50 GB/s InfiniBand link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3813", "title": "Expert Offloading: CPU-GPU Expert Swapping for Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is CPU expert swapping over PCIe 4.0 viable for serving Mixtral 8x7B under a 50 ms/token interactive latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3814", "title": "Shared vs Routed Experts: DeepSeek's Hybrid Architecture", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why include 2 shared experts alongside 160 routed experts, and what systems benefits does this hybrid MoE design provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3815", "title": "MoE Training Instability: Expert Collapse Diagnosis", "topic": "mixture-of-experts", "competency_area": "architecture", 
"track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused this expert collapse, and how do you recover the training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3816", "title": "MoE Gradient Computation: The Router Gradient Challenge", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What gradient bias does the straight-through top-k estimator introduce, and what alternative routing-gradient approaches exist?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 3}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3817", "title": "MoE Serving with Expert Caching", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a GPU expert cache for a 64-expert top-2 MoE when most conversations activate only 12-15 experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3818", "title": "MoE vs Dense: Cost-per-Token Comparison for Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the dense 70B and MoE 8x70B options compare in total GPU-hours and cost at $2/GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3819", "title": "MoE Expert Pruning for Inference Optimization", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can you prune the 4 low-traffic experts for inference, and what memory savings and quality impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3820", "title": "MoE with Tensor Parallelism: Splitting Experts Across GPUs", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you lay out tensor parallelism within each 25B-parameter expert and expert parallelism across 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3821", "title": "MoE Quantization: Expert-Level vs Layer-Level Calibration", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the uniform GPTQ calibration, and how do you fix the 
severe quality degradation in specialized experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3822", "title": "MoE Architecture: Router Network Design and Placement", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which layers are replaced with MoE layers, and how does the router network interact with the expert MLPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3829", "title": "Shadow Deployment PCIe Bottleneck", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling 100% shadow traffic degrade performance despite low H100 compute utilization and free HBM bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3830", "title": "H100 Batch Size 1 Compute Collapse", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does H100 compute utilization collapse below 1% for batch-1 inference with an arithmetic intensity of 1 FLOP/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3831", "title": "Analyzing Training Time Overhead of Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does full gradient checkpointing add exactly 100 ms to the training step, and what compute-memory tradeoff causes it?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 0}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3832", "title": "LLM Inference Utilization at Batch Size 1", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does compute utilization collapse at batch size 1, and why is peak-TFLOPS cost estimation flawed for this workload?", "chain_ids": ["cloud-chain-auto-secondary-015-07"], "chain_positions": {"cloud-chain-auto-secondary-015-07": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3833", "title": "PFC Congestion Spreading and Cluster Throughput Collapse", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a localized receiver bottleneck during RoCEv2 All-to-All cause cluster-wide throughput collapse under PFC?", 
"chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 0}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3834", "title": "Kubernetes Pod Affinity and GPU Bandwidth", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did default Kubernetes pod spreading bottleneck training, and how did co-locating the 4 pods on one NVLink node fix it?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 0}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3835", "title": "Optimal Checkpoint Interval Scaling Tradeoffs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does doubling the GPU count reduce the optimal checkpoint interval when checkpoint write time is unchanged?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3836", "title": "Explaining Stale Online Features with Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are recommendations stale despite 10ms inference and 20% GPU utilization, and where is the true bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3837", "title": "FlashAttention Arithmetic Intensity Shift", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does standard attention underutilize the A100's compute capacity, and how does FlashAttention's SRAM tiling shift the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3838", "title": "Uncoalesced Memory Access in Gather Kernel", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the strided gather saturate 2.0 TB/s HBM bandwidth but deliver only about 125 GB/s of useful data?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3839", "title": "Throughput Collapse in Fallback Model Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", 
"phase": "both", "question": "Why does switching from a batched 30B model to a batch-1 7B fallback reduce total tokens/second instead of shedding load?", "chain_ids": ["cloud-chain-auto-secondary-015-16"], "chain_positions": {"cloud-chain-auto-secondary-015-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3840", "title": "Ring-AllReduce Cross-Node Bottleneck", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does Ring-AllReduce jump from 0.155 s on 8 GPUs to 3.0 s on 16 GPUs across two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3841", "title": "Compiler Heuristics for MatMul Fusion", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why would the compiler avoid fusing the compute-bound MatMul with GeLU despite the usual memory-bandwidth benefit of fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3842", "title": "Analyzing Memory Bounds in Unfused Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are separate GeLU, Dropout, and Scale kernels memory-bound, and how does fusing them change HBM traffic?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3843", "title": "Offline Distillation PCIe Bandwidth Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does offline distillation with full-vocabulary teacher logits collapse A100 utilization despite a small 100M student model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3844", "title": "KV Cache Pre-allocation OOM on A100 GPU", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the service OOM on the 25th request when each active request uses only 100 tokens of context, and the maximum KV cache for 4096 tokens is exactly 2 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3845", "title": "H100 Vector Addition Utilization Collapse", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP16 vector addition on a GPU show less than 0.1% compute utilization despite the GPU's 989 TFLOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3847", "title": "Analyzing FP16 Speedup on A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does switching this [16, 4096] x [4096, 4096] layer from FP32 to FP16 produce exactly a 2x speedup on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3848", "title": "MoE Capacity Factor Communication Overhead", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does raising the MoE capacity factor from 1.0 to 1.5 increase All-To-All latency by 50% even though valid routed tokens are unchanged?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 0}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3849", "title": "Analyzing Low Compute Utilization in Memory-Bound Kernels", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is compute utilization capped near 13% when the layer's arithmetic intensity is 20 FLOPs/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3850", "title": "H100 Throughput Drop at Constant Power Limit", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the clock speed and GEMM throughput drop after 2 minutes even though power draw remains at the 700W limit?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3851", "title": "OOM During Optimizer Initialization on A100", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AdamW initialization OOM on an 80 GB A100 after loading only 14 GB of FP16 weights for a 7B model?", "chain_ids": ["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 1}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3852", "title": "Multi-Tier LLM Fallback and Load Shedding", "topic": 
"graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design request routing, degradation, and load shedding to survive a 10,000 QPS spike while prioritizing premium users?", "chain_ids": ["cloud-chain-auto-secondary-015-16"], "chain_positions": {"cloud-chain-auto-secondary-015-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3853", "title": "Architecting an Online Distillation Pipeline for a 70B Model", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the 70B-to-7B distillation pipeline to avoid logit I/O bottlenecks and keep student training busy?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-29": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3854", "title": "KV-Cache Aware LLM Routing", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route variable-length LLM requests to maximize KV-cache hits and utilization while avoiding hot spots and OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3855", "title": "Architecting Mixed-Precision Training for a 100B LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What mixed-precision formats would you use for weights, activations, gradients, and optimizer states when training the 100B LLM, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3856", "title": "Distributed Serving Architecture for 70B LLM", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you parallelize and allocate memory for serving the 70B FP16 chat model on each 8x H100 node to support high concurrency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3857", "title": "MoE Network Placement and Parallelism Strategy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you place TP, PP, DP, and EP for the 1.2T MoE model to keep token all-to-all off the slow 
InfiniBand fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3858", "title": "Architecting RDMA Transport for H100 Distributed Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect the RDMA transport, memory registration, and buffering for the 128-node ring all-reduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3859", "title": "Real-time LLM Safety Guardrail Architecture", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you co-locate the three 1B safety classifiers on H100s or deploy them on A100 microservices to meet a 40ms P99 latency budget?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3860", "title": "Multimodal Sharded Storage Architecture", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect the storage format and data loading to avoid I/O bottlenecks and minimize object storage costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3861", "title": "Rack-Level Thermal Architecture for Dense H100 Clusters", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule and power-cap the 5-node H100 racks to maximize throughput without exceeding the 30 kW cooling limit?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 3}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3862", "title": "Pod Placement Bandwidth Bottleneck", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What Kubernetes scheduling behavior explains a 4 GB activation transfer taking 80 ms between the two 4-GPU Pods?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3863", "title": "Synthetic Data PCIe Bottleneck in 3D Imaging", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the H100 reach only about 22% MFU when synthetic 3D batches 
are generated with zero CPU latency?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3864", "title": "Diagnosing Low Host-to-Device Bandwidth", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 4 GB host-to-device batch transfer take 250 ms over PCIe Gen5 instead of the theoretical minimum, and how can it be fixed?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3865", "title": "Diagnosing Low Decoder MFU on A100", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the decoder phase fall to roughly 0.6% MFU at batch size 1 while the encoder achieves 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3866", "title": "Diagnosing High Inference Latency on A100", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is average end-to-end latency 200 ms when the execution time is 40 ms and arrival rate is 20 requests per second?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 0}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3867", "title": "Toxicity Classifier Bandwidth Saturation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling a 5 TFLOPS guardrail classifier with 0.5 FLOPs/Byte intensity collapse LLM throughput on the GPU?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3868", "title": "K8s Network Architecture for PyTorch DDP", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Kubernetes networking option should you choose for DDP, Multus/SR-IOV RDMA or Calico overlay, and why?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 1}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3869", "title": "CPU Offload vs Gradient Accumulation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture provides higher training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3870", "title": "FP16 Dual A100 vs FP8 Single H100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which architecture yields higher generation throughput, and what precision-accuracy tradeoffs must be managed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3871", "title": "Evaluating Upgrades for Memory-Bound Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which optimization meets the 50ms SLA: upgrading to an H100 or applying INT8 weight-only quantization on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3872", "title": "Scaling Up vs Out Queue Dynamics", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Assuming Poisson arrivals and exponentially distributed service times (modeling as independent M/M/1 queues per GPU), which option should you choose and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3873", "title": "Evaluating RDMA Kernel Bypass for Distributed Clusters", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which transport design meets the 600 ms budget for sending a 16 GB tensor, TCP/IPoIB staging or GPUDirect RDMA?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 1}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3874", "title": "Disaggregated Prefill vs Hedged Requests for TTFT", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture is better for guaranteeing the SLA for a prompt that produces a 2 GB KV cache?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 1}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3875", "title": 
"LLM Training FLOPs and A100 Time Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many FLOPs are needed to pre-train the 1B model on 52B tokens, and what is the theoretical A100 training time?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3876", "title": "Rack Power Budgeting for H100 vs A100 Servers", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the H100 and A100 TDPs, and how many 8-GPU servers of each fit in a 15 kW rack with 2 kW host power?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 0}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3877", "title": "PCIe Gen5 Transfer Time and Pinned Memory", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long should a 16 GB host-to-GPU transfer take over PCIe Gen5 x16, and why does pinned memory matter?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3878", "title": "VRAM Calculation for FP16 Guardrail Model", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM do the FP16 weights of the 7B guardrail require, and what percentage of an 80 GB A100 is that?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3879", "title": "Rack Cooling Limits for 8-GPU H100 vs A100 Nodes", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the thermal output of 8x H100 versus 8x A100 GPUs, and how many full 8x H100 servers can a 15 kW rack cool?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3880", "title": "Calculate Model State Memory per GPU", "topic": 
"3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much per-GPU model-state memory is required for the 64B model with TP=8, PP=4, DP=16, and no ZeRO?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 1}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3881", "title": "KV Cache Footprint: Decoder vs Encoder-Decoder", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact KV-cache memory footprint after a 1024-token prompt (batch 32) for the decoder-only and encoder-decoder models?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3882", "title": "Compute Equal Opportunity Difference and Memory Read Latency", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Equal Opportunity Difference for Groups A and B, and what is the minimum HBM read time for the 10 GB dataset?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3883", "title": "SRAM Calculation for FlashAttention Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given a head dimension d=128, a query block size B_r=64, and key/value block sizes B_c=128, how much SRAM in bytes is needed for the FP16 Q, K, V, and O tiles, and does it fit within the 192 KB limit?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 0}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3884", "title": "Calculate Logit Memory in LLM Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many GiB are required to store both teacher and student FP32 logits for the given batch, sequence length, and vocabulary size?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-3885", "title": "Calculate KV-Cache Memory for 7B Model Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache memory in GB is required for a batch size of 16 at 1024 tokens for a 7B FP16 Transformer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3886", "title": "Calculate Mixed-Precision Memory Footprint for Adam Training", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much memory is required for the 7B model's FP16 weights, gradients, FP32 master weights, and Adam states, and will it fit on an 80 GB A100?", "chain_ids": ["cloud-chain-auto-secondary-015-01"], "chain_positions": {"cloud-chain-auto-secondary-015-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3887", "title": "Calculate Expert Token Capacity in Top-1 MoE Routing", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expert capacity per expert for batch 128, sequence length 2048, 8 experts, and capacity factor 1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3888", "title": "Calculate Tensor Parallel Communication Overhead", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical communication time in milliseconds for the MLP forward All-Reduce across the 2 GPUs?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 0}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3889", "title": "Calculate Pipeline Bubble Overhead for 1F1B Schedule", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total 1F1B pipeline step time and exact bubble overhead percentage for p=8 and m=32?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3890", "title": "Parquet Storage and Pipeline Bottleneck Calculation", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much Parquet storage (in GB, base-10) is required, and how long will it take to load and decompress the full 50B-row dataset?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], "chain_positions": {"cloud-chain-auto-secondary-007-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3891", "title": "Coreset Selection Pipeline for LLM Pre-training at Scale", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the 15T-to-3T coreset scoring pipeline, identify its bottlenecks, and estimate scoring plus training time?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3892", "title": "H100 Cluster Power and Cooling Architecture", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which cooling architecture maximizes compute under the 25 MW facility cap, and how many nodes and racks can air vs D2C liquid support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3893", "title": "VLM-Based Automated Curation at 100B Scale", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What distributed VLM scoring architecture would you use, what is the bottleneck, and how many A100 GPUs are needed to finish in 14 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3894", "title": "CUDA-to-HIP Translation Overhead for a Custom GEMM Kernel", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How would you use hipify-perl to port the 15K CUDA lines, and what manual work and timeline should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3895", "title": "ONNX Runtime Execution Provider Selection for Multi-Accelerator Inference", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs are required for a ResNet-50 batch of 32, and why is the 2.1 TFLOP estimate wrong?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3896", "title": "Triton Kernel Portability from NVIDIA to AMD GPUs", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At 55% of MI300X FP16 peak, what 
effective throughput is the Triton attention kernel achieving?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3897", "title": "Vendor Lock-in TCO Analysis: CUDA Moat vs Multi-Backend Investment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you invest $400K in a multi-vendor port or accept CUDA lock-in, and what is the break-even period assuming a 50/50 workload split?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3898", "title": "SYCL Portability Layer Performance on Intel Max Series vs H100", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If SYCL loses 69 TFLOPS relative to a 910 TFLOPS native baseline, what is the performance tax percentage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3899", "title": "ONNX Graph Partitioning Across Mixed Execution Providers", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you reduce the 40% ONNX Runtime latency hit from 50 CUDA EP fallback ops among 340 operators?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3900", "title": "Warp Size Divergence When Porting CUDA Kernels to AMD CDNA", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the HIP reduction kernel 40% slower on the AMD accelerator, and how should the shuffle reduction be fixed for 64-wide wavefronts?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3901", "title": "TPU v5e XLA Compilation Constraints for Portable Model Code", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you refactor the PyTorch model so torch.compile can run it on both GPU Inductor and TPU XLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3902", "title": 
"Designing a Hardware-Agnostic Kernel Dispatch Layer", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What kernel dispatch architecture would let one inference framework run on CUDA, ROCm, and XLA with under 10% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3904", "title": "ONNX Model Compatibility Matrix Across Runtime Versions", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How should you handle an ONNX opset 18 GroupNormalization model when production ONNX Runtime 1.14 only supports opset 17?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3905", "title": "Triton's IR Compilation Pipeline for Multi-Backend Targeting", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does Triton's compilation pipeline enable portability across NVIDIA PTX and AMDGPU, and where are backend-specific choices made?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3906", "title": "ROCm Library Parity Gap for Production ML Workloads", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which components have ROCm equivalents at performance parity, and which do not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3907", "title": "Cross-Platform Model Numerics Divergence Debugging", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you determine whether the 0.003 FP16 logit difference and 0.2% quality drop are a bug or numerical noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3908", "title": "Portable Quantization Formats Across Accelerator Backends", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How is W4A16 executed on modern GPUs, and why don't you need native 4-bit matrix multiplication hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3909", "title": "CI/CD Pipeline Design for Multi-Accelerator Kernel Testing", "topic": "software-portability", "competency_area": 
"cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design CI/CD to test 80 custom kernels across H100, MI300X, and TPU v5e without dedicated hardware in CI?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3910", "title": "HIP Kernel Launch Parameter Translation from CUDA", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you translate the CUDA launch to HIP on MI300X, and what occupancy issue does the 48 KB LDS allocation create?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3911", "title": "MLIR as a Universal Compiler IR for ML Portability", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many engineers are required to support the 3 new hardware targets using MLIR, assuming 2 engineers maintain shared MLIR infrastructure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3912", "title": "TPU v5e SPMD Programming Model vs CUDA's SIMT", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the key conceptual shifts when porting a CUDA SIMT sparse attention kernel to a TPU v5e SPMD model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3913", "title": "Evaluating torch.compile Backend Portability Across Accelerators", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the portability constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3914", "title": "Multi-Backend Distributed Training: Network Bandwidth Trap", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you architect multi-vendor distributed training, and why does a 14 GB transfer take 124 ms instead of 15 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3915", "title": "ONNX Operator Coverage Gap Analysis for Transformer Models", "topic": 
"software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you fix the three GPT-2 ONNX runtime failures on DirectML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3916", "title": "Memory Layout Portability: NCHW vs NHWC Across Accelerators", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do NCHW layouts cause a 20-30% drop on TPU v5e and MI300X, and how do you fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3917", "title": "Benchmarking Methodology for Cross-Platform Performance Claims", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What methodology do you apply to evaluate the vendor's cross-platform performance claims?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3918", "title": "Migrating cuDNN-Fused Operators to Portable Alternatives", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you recover the performance of a cuDNN-specific 3-way fusion on MI300X when MIOpen lacks an equivalent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3919", "title": "Portable Mixed-Precision Training Across GPU Architectures", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do you make mixed-precision training portable across hardware architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3920", "title": "Strategic Platform Risk Assessment for ML Infrastructure", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What framework do you use to assess and mitigate the risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3921", "title": "Driver Version Compatibility Hell in Multi-GPU Cloud Deployments", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the container crash with driver 525 and CUDA runtime 12.1, and how should you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3922", "title": "Portable Profiling: NCU 
vs rocProf vs XLA Profiler Cross-Platform Analysis", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you create comparable performance analysis across these disparate hardware platforms and profilers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3923", "title": "Compiler Fusion Differences Across XLA, TorchInductor, and MIGraphX", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do TorchInductor, MIGraphX, and XLA produce different fusion granularities for the same 12-layer transformer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3924", "title": "Overlapping Backward Pass with AllReduce in Data-Parallel Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does DDP start AllReduce before the entire backward pass finishes?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 0}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3925", "title": "Gradient Bucket Size Tuning for Optimal Communication Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you tune DDP gradient bucket size to reduce the 40% communication overhead for the 1.3B model?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 1}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3926", "title": "NCCL Async Operations and CUDA Stream Orchestration for Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If 4GB of gradients are sent over a 50 GB/s link, what is the serial communication time before any overlap?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 2}, "chain_tiers": {"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3927", "title": "1F1B vs GPipe — What 1F1B Actually Saves", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 4 stages and 8 micro-batches, what is the bubble fraction under GPipe versus 1F1B, and what does 1F1B actually save?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3928", "title": "Double Buffering for Data Loading Overlap with GPU Compute", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How would you use double buffering to hide the 20 ms CPU-to-GPU transfer behind the 15 ms GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3929", "title": "Prefetching Activations in Pipeline Parallelism Across Nodes", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design an activation prefetching scheme to overlap 256MB activation transfers over 400 Gbps InfiniBand with compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3930", "title": "Overlap Efficiency Degradation at Scale: 8 GPUs vs 256 GPUs", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AllReduce overlap efficiency fall from 92% at 8 H100s to 68% at 256 H100s, and how would you fix it?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 2}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3931", "title": "Tensor Parallelism AllReduce Overlap with Feedforward Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can standard tensor parallelism AllReduces be overlapped with computation, and what techniques partially address this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3932", "title": "ZeRO Stage 3 Communication-Computation Overlap Strategy", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap ZeRO-3 parameter AllGathers with forward and backward computation across 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3933", "title": "CUDA Stream Priority for Communication vs Computation Scheduling", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you prioritize CUDA streams to prevent communication from blocking computation?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 1}, "chain_tiers": 
{"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3936", "title": "Overlapping AllReduce with Optimizer Step via Gradient Sharding", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can we overlap the optimizer step with ongoing AllReduce by processing shards as their gradients are reduced?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3937", "title": "FSDP Prefetch Policy Tuning for Communication-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which FSDP prefetch policy should you use when BACKWARD_PRE OOMs but NO_PREFETCH is 40% slower, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3938", "title": "Overlapping KV Cache Transfer with Decode Computation in Serving", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap transferring a 2GB KV cache over NVLink with decode on the destination GPU?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 0}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3939", "title": "MI300X Infinity Fabric vs H100 NVLink for Communication Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do H100 NVSwitch and MI300X Infinity Fabric topology differences affect overlap for 70B LLM training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3940", "title": "Diagnosing Failed Overlap: NCCL Blocking on Compute Stream", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are the NCCL AllReduce kernels failing to overlap with the backward computation, running sequentially instead?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 0}, "chain_tiers": {"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3941", "title": "Expert Parallelism All-to-All Overlap in Mixture-of-Experts Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", 
"question": "How would you overlap the 8 ms All-to-All routing with 6 ms expert compute across 32 MoE layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3942", "title": "PCIe DMA Engine Overlap for Host-Device Data Movement", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does H100's separate DMA copy engine let PCIe transfers overlap with kernel execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3943", "title": "Quantifying Overlap Efficiency with Nsight Systems Timeline Analysis", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the overlap efficiency and exposed communication time for backward [180,380] ms and NCCL [200,350] ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3944", "title": "Micro-Batch Size Impact on Pipeline Parallelism Bubble Ratio", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you choose the micro-batch count for an 8-stage pipeline with global batch 1024, balancing bubble ratio and GPU efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3945", "title": "Overlapping Embedding Table AllReduce in Hybrid Parallelism", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the dense MLP AllReduce and the sparse embedding AlltoAll overlap, and what is the latency impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3946", "title": "TPU v5e ICI Ring Topology and Communication Overlap Opportunities", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a 2D torus ICI topology change communication-computation overlap strategy for data-parallel training versus NVSwitch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3947", "title": "Activation Recomputation vs Activation Stashing Trade-Off for Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does this choice affect communication overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3948", "title": 
"Ring AllReduce Latency Model and Bandwidth Saturation Analysis", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the time complexity of ring AllReduce for N GPUs and M bytes, and why does it help communication overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3949", "title": "Overlapping Gradient Accumulation Steps with Next Batch Loading", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you prefetch data during K=4 gradient accumulation so micro-steps do not stall waiting for the next batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3950", "title": "Async Checkpointing Overlap with Training Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you make 70B model checkpoints asynchronous so the 45-second save every 100 steps does not pause training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3951", "title": "Sequence Parallelism AllGather Overlap in Megatron-LM", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Megatron-LM overlap the sequence-parallel AllGather before attention/MLP blocks with computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3952", "title": "Compute-Communication Overlap in Distributed Inference with Speculative Decoding", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you overlap the draft model's K-token generation with the 4-GPU target model's previous verification?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 2}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3953", "title": "NCCL Graph Capture for Reducing Communication Launch Overhead", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What benefit does CUDA Graph capture provide for 200 small NCCL AllReduce calls, and what limitations must you handle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3954", "title": "Overlapping Gradient Norm 
Computation with AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you overlap the global gradient norm computation with the gradient AllReduce to eliminate its latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3955", "title": "Communication Overlap in Context-Parallel Long-Sequence Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you overlap KV exchange with attention compute for 128K-context training using context parallelism across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3956", "title": "Multi-Level Overlap: Data Loading, Compute, and Communication Pipeline", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you pipeline data loading, forward/backward compute, and AllReduce so the 240 ms step approaches 200 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3957", "title": "Weight Update Sharding with Overlapped AllGather in FSDP2", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What overlap opportunities exist in FSDP2's ReduceScatter, local optimizer, and AllGather pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3958", "title": "Network Congestion Detection During Overlapped Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 64-GPU step time jump when AllReduce latency fluctuates from 30ms to 200ms, and how would you fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3959", "title": "Overlapping Parameter Server Pull with Forward Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can this parameter server pull be overlapped with the forward pass of non-embedding layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3960", "title": "GradScaler Synchronization as a Hidden Overlap Barrier", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes 
it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3961", "title": "Software Pipelining in Triton Kernels for Memory-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does Triton's num_stages software pipelining improve your H100 GEMM, and what value would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3963", "title": "Prefill-Decode Disaggregation Overlap in Production LLM Serving", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap a 4GB KV-cache transfer over 100 Gbps Ethernet to minimize time-to-first-token in disaggregated serving?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 1}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3964", "title": "Backward-Triggered Gradient Accumulation with Deferred AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the overlap tradeoffs of deferring DDP AllReduce to only the last micro-step for K=4 gradient accumulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3965", "title": "H100 TMA (Tensor Memory Accelerator) for Async Memory Loads", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does H100 TMA improve compute-memory overlap compared with A100's cooperative shared-memory loads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3966", "title": "Stragglers and Overlap: Why the Slowest GPU Determines Step Time", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does one slow GPU destroy overall performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3967", "title": "Communication-Aware Model Architecture Design", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you choose the transformer dimensions and tensor-parallel layout to maximize communication-computation overlap on 1024 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3968", "title": "Overlapping Collective Permute with MoE Expert Computation on TPU", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can overlap still work between ICI transfers and expert computation on TPU v5e?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3969", "title": "Bucket Fusion Order and Its Effect on First-Bucket Overlap Latency", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you form DDP gradient buckets to improve AllReduce overlap for the 1MB output layer and 25MB intermediate layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3970", "title": "Cross-Mesh Communication Overlap in 3D Parallelism", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule TP, PP, and DP communication to maximize overlap without IB contention in this 512-GPU 3D-parallel step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3971", "title": "Overlap Measurement: MFU vs HFU and Communication Accounting", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the difference between MFU 45% and HFU 58%, and what do they imply about communication-computation overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3972", "title": "Overlap in Heterogeneous Pipeline: Different GPU Types per Stage", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can communication overlap mitigate the 3x A100/H100 pipeline-stage imbalance, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3973", "title": "Asynchronous Local SGD as an Alternative to Overlapped AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Local SGD at H=4 and H=16 trade communication savings against convergence penalties compared to overlapped AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3985", "title": "Training Budget Overrun From Sequence Length", "topic": "compute-cost-estimation", "competency_area": "compute", 
"track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does switching from 2,048 to 8,192 context affect a 400,000 GPU-hour training budget when attention is 30% of step time?", "chain_ids": ["cloud-chain-auto-secondary-015-07"], "chain_positions": {"cloud-chain-auto-secondary-015-07": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-07": "secondary"}, "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4009", "title": "ECMP Hash Polarization in Fat-Tree Topologies", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the root cause and what distinguishes ECMP hash polarization from simple oversubscription?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 0}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4010", "title": "PFC Deadlock in Lossless Ethernet Fabrics", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does PFC deadlock form in this Clos network, and how would you prevent full-fabric stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4011", "title": "Incast Congestion During AllReduce Reduce-Scatter", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What incast pattern is causing the 50ms reduce-scatter spikes, and why is reduce-scatter more vulnerable than all-gather?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4012", "title": "Designing ECN Thresholds for ML Traffic", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What ECN thresholds, PFC thresholds, and DCQCN rate-reduction parameters would you set for the 1024-MI300X 400G RoCE cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4013", "title": "Switch Buffer Sizing for ML Incast", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum shared buffer is needed to absorb the 8-to-1 incast burst for one 4μs RTT without packet drops?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4014", "title": "DCQCN vs TIMELY Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", 
"status": "published", "phase": "both", "question": "Which congestion-control protocol, DCQCN or TIMELY, better serves this mixed AllReduce and pipeline-parallel workload, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4015", "title": "Buffer Management with Dynamic Thresholds", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you redesign switch buffer management so 4 elephant flows do not starve 60 mice flows on the 64-port 400G switch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4016", "title": "Congestion Spreading in Multi-Rail ML Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What rail-isolation requirements would prevent congestion on Rail 0 from degrading Rails 1-7 in the 4096-H100 8-rail cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4017", "title": "Flowlet Switching for AllReduce Load Balancing", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the inter-packet gap for 4KB packets at 400Gbps, and what flowlet timeout should you use for 256MB AllReduce messages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4018", "title": "Tail Latency Impact of PFC Storms", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the throughput impact of 2ms PFC storms once per minute per ToR in this 256-A100 synchronous AllReduce job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4019", "title": "Weighted ECMP for Heterogeneous Link Speeds", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does standard ECMP fail with one spine link degraded to 200G, and what routing changes are needed?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 1}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4020", "title": "Congestion Control for Pipeline-Parallel Microbatches", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design congestion control for 8MB activation transfers every 2ms to minimize 16-stage pipeline bubble time?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4021", "title": "ECN Marking Accuracy at 400Gbps Line Rate", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How many ECN marking decisions per second are required at 400Gbps with 4KB packets, and why must the algorithm be simple?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4022", "title": "Victim Flow Starvation Under PFC", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does checkpoint traffic using only 15% of fabric bandwidth cause a 70% AllReduce throughput drop with PFC enabled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4023", "title": "Congestion-Aware Adaptive Routing on InfiniBand", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should Adaptive Routing be enabled for the mix of 90% AllReduce and 10% parameter-server traffic, and how should it be configured?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 3}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4024", "title": "Network Calculus for ML Traffic Guarantees", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Using network calculus concepts, what worst-case end-to-end delay bound do you derive for an 8MB activation tensor traversing 6 Clos hops with 2MB buffers?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4025", "title": "NCCL NET_PLUGIN Congestion Tuning", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How did lowering NCCL_IB_TIMEOUT to 14 and NCCL_IB_RETRY_CNT to 3 cause retransmissions, and what values should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4026", "title": "Slow Receiver Problem in RoCE ML Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does one MI300X NIC limited to 200Gbps affect a 64-GPU ring AllReduce, and why is this worse than in a tree?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4027", "title": "Micro-Burst Detection for ML Traffic", "topic": 
"congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What 400Gbps micro-burst can hide inside 5-second switch-counter samples, and how would you detect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4028", "title": "Congestion Control at 800G and Beyond", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can PFC, DCQCN, and ECN scale to 800Gbps links with 4KB packets, or is a fundamental redesign needed?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 3}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4029", "title": "Phantom Congestion from NCCL Tree AllReduce", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does tree AllReduce generate 40% more ECN-marked packets than ring AllReduce despite the same total data volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4030", "title": "Cross-Job Congestion Isolation in Multi-Tenant Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the network isolation architecture to prevent one job's AllReduce congestion from degrading another's performance.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4031", "title": "TPU v5e Congestion in ICI Fabric", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does congestion management for AllReduce differ on TPU v5e's ICI torus versus a switched Clos GPU fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4032", "title": "End-to-End Congestion Budget for Training Iteration", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate the <5% communication-overhead SLA budget across ECMP imbalance, ECN/DCQCN, PFC, retransmissions, and software?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4033", "title": "ECMP vs Adaptive Routing Tradeoff Space", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When is static ECMP preferable to 
Dynamic Load Balancing on Tomahawk-5 for MI300X training traffic, considering overhead?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 2}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4034", "title": "Ring AllReduce Bandwidth Formula", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the ring AllReduce time formula for N GPUs, bandwidth B, and gradient size D, and why is it near bandwidth-optimal?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 0}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4035", "title": "Ring vs Tree AllReduce Latency Tradeoff", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For 1024 MI300X GPUs synchronizing a 50MB tensor, which NCCL AllReduce algorithm is faster and where is the crossover point?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 1}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4036", "title": "Gradient Compression with Top-K Sparsification", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you use Top-K gradient sparsification at 1% density for 4GB gradients, and what happens to the dropped 99%?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 1}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4037", "title": "Bucket Fusion in NCCL AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With NCCL_BUFFSIZE=4MB, how many fused AllReduce operations does ResNet-152 need, and what speedup comes from reduced launch overhead?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 0}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4038", "title": "Diagnosing Gradient Staleness in Async SGD", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does average staleness of 12 iterations hurt async SGD convergence for a 13B LLM, and what staleness is tolerable?", 
"chain_ids": ["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 0}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4039", "title": "Hierarchical AllReduce Design for Multi-Rack Clusters", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hierarchical AllReduce stages and algorithms would you use for the 512-GPU, 64-node, 8-rack cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4040", "title": "Gradient All-to-All for Expert Parallelism", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the MoE All-to-All token routing pattern across 128 MI300X GPUs, and how does its bandwidth compare to AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4041", "title": "Gradient Quantization to INT8", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What communication time savings and quantization error result from INT8-quantizing the 2 GiB FP16 gradients before AllReduce?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 0}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4042", "title": "AllReduce Communication Hiding with Computation Overlap", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With a 200ms backward pass and 60ms AllReduce on 128 H100s, what maximum overlap and iteration time can you achieve?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 1}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4043", "title": "Local SGD vs AllReduce at Scale", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate wall-clock time, convergence, and network utilization to determine whether AllReduce or Local SGD is superior for this run.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4044", "title": "Recursive Halving-Doubling AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": 
"published", "phase": "training", "question": "How does recursive halving-doubling latency compare to ring AllReduce, and why is it preferred for small messages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4045", "title": "Gradient Accumulation as Communication Reduction", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 4x micro-batch accumulation on 64 GPUs, what are the effective batch size, communication overhead reduction, and memory cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4046", "title": "PowerSGD Low-Rank Gradient Compression", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "For PowerSGD rank 4 on a 4096×4096 gradient, what are the compression ratio, AllReduce savings on 256 GPUs, and approximation error?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 2}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4047", "title": "Gradient Synchronization for ZeRO Stage 3", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What is the ZeRO-3 per-layer communication schedule and cost for a 64-GPU, 30B-parameter, 60-layer training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4048", "title": "Detecting Silent Gradient Corruption", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you detect which GPU or network path is producing corrupted gradients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4049", "title": "AllReduce Bandwidth Efficiency on NVLink vs RoCE", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 64KB gradient, what are the intra-node NVLink and inter-node RoCE AllReduce times, and where is the latency crossover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4050", "title": "SHARP In-Network AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 512 H100s, what benefits and limitations does SHARP have versus host-based NCCL AllReduce?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4051", "title": "Mixed-Precision Gradient Synchronization Pipeline", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you redesign the BF16-gradient Adam pipeline to minimize AllReduce communication while preserving FP32 optimizer precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4052", "title": "Gradient Synchronization for FSDP with Activation Checkpointing", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "With activation checkpointing, what total FSDP communication volume and scheduling constraints apply for the 65B LLM on 256 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4054", "title": "AllReduce Algorithm Selection in NCCL", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which AllReduce algorithm should you choose for 64KB, 256MB, and 4GB gradients on 2048 MI300X GPUs, and why under LogP?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 3}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4055", "title": "TPU v5e AllReduce over ICI Torus", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does the AllReduce time for a 1GB gradient on this torus compare versus 256 H100s connected via 400Gbps switched Ethernet, and why does the torus topology affect algorithm selection differently than a Clos network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4056", "title": "Gradient Clipping Interaction with AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What overhead does the scalar norm AllReduce add for global gradient clipping on 256 MI300X GPUs, and can it be overlapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4057", "title": "DiLoCo: Distributed Low-Communication Training", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does DiLoCo with H=500 compare with synchronous AllReduce for a 70B model on 256 GPUs in communication, time, and convergence?", "chain_ids": 
["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 2}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4058", "title": "AllReduce on Non-Power-of-Two GPU Counts", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What AllReduce strategy should you use for 497 GPUs, and why might ring still beat a non-power-of-two recursive scheme?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4059", "title": "InfiniBand Architecture Fundamentals", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are HCAs, subnet managers, queue pairs, and virtual lanes in InfiniBand, and how do they enable RDMA on a 256-H100 cluster?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 0}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4060", "title": "RoCE v2 vs InfiniBand for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 1024-MI300X cluster, how do InfiniBand NDR and RoCE v2 compare in congestion control, multi-tenancy, operations, and recovery?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 1}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4061", "title": "Zero-Copy RDMA for Gradient Transfer", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What zero-copy GPUDirect RDMA data path would you use for gradient AllReduce between two GPUs in different nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4062", "title": "Kernel Bypass and Verbs API Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 65,536 TCP packets carrying a 256MB transfer, how much context-switch and kernel overhead does RDMA bypass save?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 0}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4063", "title": "GPUDirect RDMA Memory Registration Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is NCCL memory registration causing the 500ms startup delay, and what are the correct MTT and QP setup costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4064", "title": "RDMA Write vs RDMA Send for AllReduce", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does NCCL prefer RDMA Write over Send/Receive for a 256MB gradient transfer, and how much overhead does it avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4065", "title": "PCIe Bandwidth Bottleneck for GPUDirect RDMA", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the GPUDirect RDMA bottleneck on H100 SXM, and how much headroom remains between PCIe Gen5 x16 and a 400Gbps NIC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4066", "title": "Designing RDMA Buffer Pool for AllReduce Pipelining", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you allocate the RDMA buffer pool for a 2GB, 8-stage pipelined ring AllReduce on 256 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4067", "title": "RDMA Completion Queue Polling vs Interrupts", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 1000 AllReduce iterations per second on 64 GPUs, what CPU-cycle overhead does CQ polling use versus interrupts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4068", "title": "RoCE v2 Packet Format and GRH Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a 4KB RDMA Write payload, what are the RoCE v2 header overhead and payload efficiency compared with native InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4069", "title": "GPUDirect RDMA Page Table Alignment", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 2KB alignment hurt MI300X GPUDirect RDMA throughput, and how much loss comes from PCIe TLP splitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4070", "title": "RDMA Reliable Connection vs Unreliable Datagram", 
"topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For AllReduce on 512 GPUs, how do InfiniBand RC and UD transports trade off QP scalability, failures, and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4071", "title": "RDMA Memory Registration with On-Demand Paging", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What GPUDirect RDMA on-demand paging architecture would let PyTorch allocate GPU memory dynamically without pre-registration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4072", "title": "GPUDirect RDMA vs GPUDirect Storage Data Paths", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do concurrent GPUDirect Storage checkpoint saves affect AllReduce throughput on a 64-MI300X cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4073", "title": "RDMA Multi-Path with Bonded NICs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use all 8 ConnectX-7 rails for a 2GB AllReduce while handling rail failures and preserving reduction ordering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4074", "title": "RDMA Latency Breakdown for a Single Write", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency breakdown for a 64-byte GPUDirect RDMA Write between two H100 nodes through one IB switch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4075", "title": "NIC-Level Traffic Shaping for RDMA Fairness", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design NIC-level traffic shaping so Job A's AllReduce does not starve Job B's latency-sensitive activation transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4076", "title": "iWARP vs RoCE v2 for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How suitable is iWARP versus RoCE v2 for large-scale ML training, considering latency, throughput, CPU overhead, and operations?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4077", "title": "RDMA Atomic Operations for Distributed Locking", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Under 256-way contention on a single RDMA CAS lock, what is the maximum lock throughput and why do atomics bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4078", "title": "Ultra Ethernet Consortium for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would UEC's packet spraying, selective retransmission, and multipath transport compare with RoCE v2 and InfiniBand at 8192 MI300X GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4079", "title": "RDMA Error Recovery in Long-Running Training Jobs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What RDMA error recovery architecture allows a 90-day, 1024-H100 training run to continue with minimal interruption during QP failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4080", "title": "GDRCopy for Small RDMA Messages", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 64-byte NCCL control messages, how do GDRCopy and GPUDirect RDMA latencies compare, and why use GDRCopy anyway?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4081", "title": "RDMA Connection Scaling for 10K GPU Clusters", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the QP state memory and setup-time costs for naive RDMA at 16,384 GPUs, and what sub-quadratic design would you use?", "chain_ids": ["cloud-chain-auto-020-03"], "chain_positions": {"cloud-chain-auto-020-03": 1}, "chain_tiers": {"cloud-chain-auto-020-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4082", "title": "RDMA Performance Isolation with SR-IOV", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the SR-IOV RDMA overhead versus bare metal on one 400G ConnectX-7 NIC, and how many tenants can share it above 90% AllReduce throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4083", "title": "RDMA with CXL Memory Expansion for ML", "topic": 
"rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is CXL-attached memory a good RDMA gradient-buffer design for a 512-GPU cluster, and why or why not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4084", "title": "Column-Parallel Linear Layer Communication Pattern", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "After a column-parallel linear layer on 8xMI300X, what collective, if any, is needed before the next layer can consume the partial outputs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4085", "title": "TP Communication Volume for Transformer Block", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For hidden=8192, TP=8, and batch=2048 on MI300X, what is the per-block AllReduce volume and time for the two forward collectives?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 1}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4086", "title": "Why TP Stays Within a Node", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is this almost certainly a bad idea, and what would you recommend instead?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 0}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4087", "title": "TP Layout Under an Asymmetric Bandwidth Budget", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "On 4 GPUs with a 2x NVLink pair + 2x PCIe pair, should the [8192, 8192] linear block use column-parallel + AllGather or row-parallel + AllReduce, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4088", "title": "Sequence Parallelism in Megatron-LM", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For seq_len=8192, batch=4, hidden=8192, TP=8, how much activation memory is wasted, and how does sequence parallelism fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4089", "title": "TP Degree Selection for Inference", "topic": 
"model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use TP=1 with a single GPU or TP=2 across two GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4090", "title": "AllReduce vs AllGather Cost in TP", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For M bytes across P GPUs, what are the ring AllReduce vs AllGather communication volumes, and which operations do Megatron's row and column parallel layers use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4091", "title": "TP and Attention Head Partitioning", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 32 attention heads on 8-GPU nodes, can you use TP=8 or TP=6, and what head-divisibility constraint governs TP degree?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4092", "title": "TP Memory Savings Calculation", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With a 30B BF16 model and TP=4 on A100-80GB GPUs, how much weight and AdamW optimizer-state memory does each GPU hold?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4093", "title": "TP Bubble: Synchronization Overhead in Practice", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What two concrete strategies would you use to reduce the 15% TP AllReduce overhead without changing TP=8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4094", "title": "GQA Impact on Tensor Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How are the KV heads distributed, and what problem arises if you try TP=16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4095", "title": "TP vs Expert Parallelism for MoE Models", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which parallelism strategy is superior for this MoE model and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-4096", "title": "Embedding Table Parallelism Strategy", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For a 3 GB embedding table with TP=8, what are two valid sharding strategies and their tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4097", "title": "Backward Pass Communication in TP", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "In Megatron's fused column->row block, what TP communications occur in backward, and what is the per-block volume for a full training step?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 2}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4098", "title": "TP with Activation Checkpointing Interaction", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "During full activation checkpointing with TP=8, which forward TP collectives are recomputed in backward, and does TP communication increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4099", "title": "TP Load Imbalance from Uneven Head Counts", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "If TP=8 partitions the 48-head model correctly, what can cause slight MLP execution-time imbalance across MI300X ranks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4101", "title": "TP Scaling Efficiency Cliff", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 13B model on H100 NVLink, how does MFU change from TP=1 to TP=8, and at what TP degree does efficiency drop sharply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4102", "title": "TP for Cross-Attention in Encoder-Decoder Models", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With TP=4 and partitioned encoder outputs, how should cross-attention KV projections be handled without adding extra communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4103", "title": "Debugging TP Numerical Divergence", "topic": 
"model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After switching from TP=1 to TP=4, what are the most likely causes of loss divergence after 500 steps, and how would you systematically debug it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4104", "title": "Context Parallelism vs Sequence Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the difference between sequence parallelism and context parallelism, and when would you use each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4105", "title": "TP Communication Overlap with NVLink", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What CUDA stream setup is needed to overlap TP AllReduce and GEMM on the 8x A100 DGX node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4106", "title": "TP Degree for Inference Latency SLA", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the minimum TP degree needed to meet this SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4107", "title": "TP Weight Sharding Memory Layout", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "For TP=8 inference, how are Megatron-style TP-sharded checkpoints stored and what are the two ways to load each GPU's shard?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4108", "title": "TP + Pipeline Parallelism Interaction", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What TP, PP, and DP layout would you design for 405B training on 128 GPUs, and what throughput should you expect?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 3}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4109", "title": "GPU Scheduling: FIFO vs Shortest-Job-First", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling policy would reduce the 4-hour FIFO wait for small inference jobs, 
and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 0}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4110", "title": "Bin-Packing GPUs for Mixed Workloads", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does first-fit GPU allocation produce low utilization for 1-, 2-, 4-, and 8-GPU jobs, and how does bin-packing improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4111", "title": "MIG Partitioning for Inference Multiplexing", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you use MIG to partition 4 A100-80GB GPUs to run 12 concurrent <7B model instances with isolation?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 2}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4112", "title": "Gang Scheduling for Distributed Training", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling concept prevents this, and why is it critical for distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4113", "title": "Fair-Share Scheduling Across Teams", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should fair-share scheduling handle Team B borrowing Team A's idle GPUs, and what happens when Team A submits a large job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4114", "title": "GPU Preemption for Priority Inference", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you preempt training jobs to free 8 MI300X GPUs for urgent inference while minimizing wasted compute and meeting the SLA?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 2}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4115", "title": "GPU Time-Slicing vs MIG vs MPS", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "inference", "question": "How do CUDA time-slicing, MIG, and MPS differ in isolation, overhead, and best use case?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 1}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4116", "title": "Spot Instance Strategy for Training Resilience", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What checkpointing strategy maximizes cost savings while minimizing wasted compute for a 13B model on 32 spot instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4117", "title": "Multi-Tenant GPU Cluster Quota Design", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design quotas, scheduling policies, and safeguards for a 500-GPU cluster shared by 8 ML teams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4118", "title": "Topology-Aware Scheduling on DGX H100", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does NVLink topology matter for placement on this node, or are all 4-GPU subsets equivalent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4119", "title": "GPU Memory Oversubscription Dangers", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If two 30 GB and 35 GB models share one 80 GB GPU and see concurrent peak traffic, what happens and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4120", "title": "Scheduler Interaction with NCCL Timeout", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can stale CUDA processes on 4 allocated GPUs cause a 64-GPU NCCL timeout, and what scheduler checks prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4121", "title": "Elastic Training with Dynamic GPU Scaling", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you safely shrink the 64-GPU training job to 48 GPUs without restarting from scratch?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4122", "title": "GPU Utilization Monitoring and Right-Sizing", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are three common causes of <20% GPU utilization, and what right-sizing action would you take for each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4123", "title": "Scheduling Deadline-Aware Training Jobs", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can 128 GPUs train the 70B model on 1T tokens within 14 days, and how would you schedule around the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4124", "title": "Power-Aware GPU Scheduling", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you adjust GPU scheduling to stay within the 250 kW thermal rejection limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4125", "title": "Fragmentation-Aware Scheduling Policy", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What scheduling policy would prevent the 16-GPU job from being blocked by fragmented free GPUs?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 2}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4126", "title": "Multi-Cluster GPU Federation Scheduling", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a federated scheduler to route jobs across the clusters based on job characteristics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4127", "title": "Job Priority Inversion in GPU Clusters", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you resolve this without wasting the training compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4128", "title": "GPU Health Monitoring for Scheduler Integration", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": 
"L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which five GPU health metrics should the scheduler monitor, and what thresholds should trigger draining a GPU from the pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4129", "title": "MIG vs Full GPU for Inference Cost Optimization", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does the cost per token compare between a full GPU and 3g.40gb MIG partitions for the 7B service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4130", "title": "Scheduling Multi-Node Training with Network Constraints", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the scheduler consider network topology to optimize the 128-GPU job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4131", "title": "GPU Lease Duration and Scheduling Efficiency", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What wall-time policy would improve efficiency when 7-day MI300X jobs only use 15% of their reserved runtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4132", "title": "Scheduling for Heterogeneous GPU Clusters", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a heterogeneous scheduler to maximize throughput-per-dollar across H100, MI300X, and A100 GPUs for a mixed workload of LLM training, inference, and fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4133", "title": "Slurm GPU Resource Configuration", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What key slurm.conf and gres.conf entries enable GPU-aware, topology-aware scheduling for the 10-node H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4134", "title": "Kubernetes GPU Device Plugin Architecture", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "When a pod requests nvidia.com/gpu: 2, what is the sequence of events from pod scheduling to GPU availability inside the container?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4135", "title": "Node Affinity for GPU Topology in K8s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What pod spec would ensure the FP8 training job is placed only on appropriate nodes with the right tolerations and GPU requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4136", "title": "Gang Scheduling in Kubernetes with Volcano", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does Volcano scheduler solve this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4137", "title": "MIG Partitioning in Kubernetes", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you configure MIG on the node and have pods request the specific 3g.40gb partitions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4138", "title": "K8s Resource Limits vs Requests for GPU Workloads", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why this asymmetry, and what does it mean for GPU scheduling?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 1}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4139", "title": "Multi-Node Training Job Orchestration in K8s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which orchestration approach is most appropriate for the multi-node PyTorch job and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4140", "title": "GPU Node Autoscaling in Kubernetes", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design autoscaling to handle traffic spikes without 5-minute cold starts?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 1}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4141", "title": "Container Image Optimization for GPU ML Workloads", "topic": 
"container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you reduce the pod startup time to under 1 minute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4142", "title": "RDMA and Host Networking for ML Training Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the Kubernetes networking bottleneck limiting NCCL to 15 GB/s, and how would you configure K8s for full InfiniBand bandwidth?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 2}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4143", "title": "K8s Pod Failure Semantics for Distributed Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What happens next to the other 31 pods, and how should the PyTorchJob be configured to handle this correctly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4144", "title": "Persistent Volume Strategy for Training Checkpoints", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you configure PersistentVolumes for 26 GB checkpoints every 30 minutes over a 3-day, 32-GPU job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4145", "title": "GPU Operator vs Manual Driver Management in K8s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the GPU Operator manage that manual installation does not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4146", "title": "K8s Scheduling Latency for Real-Time Inference", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What caused the 30-second 503 gap during rolling updates, and how would you deploy GPU inference pods with zero downtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4147", "title": "P99 vs P50 Divergence Under Load", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is P50-to-P99 divergence a poor signal for capacity 
planning, and what root causes should you investigate before scaling out?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4148", "title": "Little's Law for GPU Inference Throughput", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Little's Law, how many concurrent requests must be in-flight, and what does this imply for GPU memory reserved for KV cache?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 0}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4149", "title": "Hedged Requests in Distributed Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected P99 improvement for hedged requests, and what is the cost in wasted GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4150", "title": "Load Balancing Algorithms and Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin fail here, and what load-balancing strategy would you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4151", "title": "P999 Latency in Multi-Stage Pipelines", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the end-to-end P99 380ms instead of 200ms, and how would you allocate per-stage tail-latency budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4152", "title": "Prefill-Decode Latency Decomposition", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For the 4096-token prompt and 256-token generation, what are total latency and TTFT, and how do prefill and decode optimizations differ?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 0}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4153", "title": "Coordinated Omission in Latency Measurement", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is coordinated omission, and how does the benchmark's pause-while-waiting behavior mask true tail latency?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4154", "title": "Tail Tolerance with Backup Requests at Scale", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design backup requests to bring P999 below 200ms on the 64-GPU MI300X service, and what extra GPU cost would that add?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4155", "title": "Queueing Theory Applied to GPU Batch Scheduling", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using M/D/1, what is the expected wait at 20ms batching, and why can a 50ms batch window improve P99 despite more queueing?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 1}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4156", "title": "Fan-Out Tail Amplification in Mixture-of-Experts", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a batch of 128 tokens, each needing different expert pairs, what is the expected P99 of the full batch, and how do you mitigate the amplification?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4157", "title": "Tail Latency Budgeting Across Microservices", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What per-service latency budgets and alert thresholds would you set for the tokenizer, H100 inference, and post-processing services?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4158", "title": "NUMA Effects on Inference Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware topology issue explains this bimodal distribution, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4159", "title": "Adaptive Timeout Design for LLM Serving", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design an adaptive timeout strategy that protects long requests and catches short failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4160", "title": "Goodput vs Throughput Under Tail Constraints", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "Why does goodput drop when batch size increases from 32 to 64, and what batch size should you choose?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 2}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4161", "title": "Tail Latency Impact of Garbage Collection", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does GC disproportionately affect tail latency in ML serving, and what are your mitigation options?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4162", "title": "Little's Law Under Variable Service Times", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What average concurrency does Little's Law predict, and why does service-time variance make P99 queue depth much worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4163", "title": "Speculative Decoding as Tail Latency Reducer", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "With 5 draft tokens at a 72% acceptance rate, what are the effective accepted tokens per step, the latency per accepted token, and the total speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4164", "title": "Cross-Region Tail Latency in Geo-Distributed Serving", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the 3-region MI300X serving system to meet a global P99 SLA of 150 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4165", "title": "Request Coalescing vs Tail Latency Tradeoff", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At what request rate does coalescing become net-negative for goodput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4166", "title": "Continuous Batching and Tail Latency Reduction", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using PagedAttention, what is the size of a single 16-token KV-cache page for a model with 80 layers, 16 KV heads, 128 head dimension, and FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4167", "title": "Fat-Tree Bisection Bandwidth for AllReduce", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the ring AllReduce time for a 1 GB gradient on 256 GPUs over 400 Gbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4168", "title": "NVLink vs Infinity Fabric for Tensor Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural differences between NVLink and Infinity Fabric affect tensor parallel efficiency, and which do you recommend given the per-GPU FP16 parameter shard size versus activation shard AllReduce transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4169", "title": "Dragonfly Topology for Large-Scale Training", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which topology better matches the hierarchical communication pattern, and what is the cost difference in switch ports?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 1}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4170", "title": "Torus Topology and Nearest-Neighbor Communication", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you compute the 3D torus bisection bandwidth for the 256-chip TPU pod, and why is it 12.8 Tbps rather than 6.4 Tbps?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 0}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4171", "title": "CXL Memory Pooling for Inference Serving", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you tier the 405B model's weights and KV cache between local HBM and the 2 TB CXL pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4172", "title": "NVSwitch Full-Mesh vs PCIe Hierarchy", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the effective AllReduce bandwidth ratio between NVSwitch and PCIe in an 8-GPU system, and which workloads justify the premium?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4173", "title": "Infinity Fabric Coherency Cost in Multi-Die MI300X", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the effective bandwidth for memory accesses that hit HBM (5.3 TB/s) 60% of the time and DRAM (0.8 TB/s) 40% of the time, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4174", "title": "Network Topology for Pipeline Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you map pipeline stages to GPUs, and what bandwidth should you use for a single InfiniBand NDR link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4175", "title": "Rail-Optimized Network for GPU Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the rail-optimized topology, what is its switch count vs a fat-tree, and which AllReduce algorithm maps best to it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4176", "title": "Network Partitioning for Failure Isolation", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 3D parallelism dimension is most affected by the 512-GPU partition, and how would you design the topology to contain failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4178", "title": "Topology-Aware AllReduce Algorithm Selection", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare ring AllReduce, hierarchical AllReduce, and NCCL's tree AllReduce; which algorithm minimizes total AllReduce time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4179", "title": "Cross-Rack Bandwidth Planning for Data Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Will the cross-rack bandwidth support AllReduce, and if not, how do you fix it without adding spine bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4180", "title": "Network Topology Impact on Checkpoint I/O", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the checkpoint write time to the 2×100 GbE NFS server, and what topology-aware checkpointing strategy reduces stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4181", "title": "Multi-Tenant Network Isolation on Shared Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition the network and set QoS so training AllReduce bursts do not cause inference latency spikes?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 3}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4182", "title": "PCIe Gen5 vs NVLink for Mixed Workloads", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 4-way tensor-parallel AllReduce time for a 100 MB activation tensor, and which link is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4183", "title": "Adaptive Routing in Dragonfly Under Adversarial Traffic", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How much traffic does the 1024-GPU 4 GB AllReduce put on each global link, and why can minimal routing cause interference?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 2}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4184", "title": "NVLink Domain Size and Scaling Limits", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What AllReduce speedup comes from scaling the NVLink domain from 8 to 72 GPUs for a 405B model, and why do returns diminish beyond 72?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4185", "title": "SM Occupancy vs Achieved Throughput", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can a 50%-occupancy H100 kernel reach 85% peak TFLOPS while a 75%-occupancy kernel reaches only 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4186", "title": "Warp Divergence in Attention Masking", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "Explain why causal masking causes warp divergence and how FlashAttention's tiling strategy eliminates it. With thread coarsening at stride 2048, which sequence positions does thread 49 process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4187", "title": "Tensor Core Utilization vs CUDA Core Fallback", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the unfused runtime breakdown for the $4096 \\times 4096$ matmul plus GELU, and why is the activation function a massive bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4188", "title": "Warp Scheduling and Latency Hiding", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With 100 active cycles in a 310-cycle interval, what memory-stall fraction does the kernel experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4189", "title": "Tensor Core Matrix Multiply Tile Sizes", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs do 352,256 Tensor Core tiles at 8,192 FLOPs each execute, and is the result measured in GFLOPs or TFLOPs?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4190", "title": "Memory Coalescing in Attention Kernels", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does column-wise access to row-major Q/K data destroy coalescing on MI300X, and how do you fix it?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4192", "title": "Thread Block Size Optimization for H100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the max blocks per SM and occupancy at 128 threads per block, and would 256 threads per block improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-4193", "title": "Kernel Fusion Strategy for Transformer Blocks", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you fuse the 12 transformer-block kernels to minimize HBM round-trips without breaking Tensor Core tile efficiency?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4194", "title": "CDNA vs CUDA SM Architecture Comparison", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural differences explain why MI300X's 304 CUs excel at large batch training while H100's 132 SMs excel at low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4195", "title": "L2 Cache Partitioning for Multi-Model Serving", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you partition H100's 50 MB L2 cache across Models A, B, and C to protect Model A's P99 latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4196", "title": "Wave Quantization in Kernel Launch", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the efficiency loss of the 140-block kernel, and what is the optimal number of thread blocks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4197", "title": "FP8 Tensor Core Precision on H100 and MI300X", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What H100 FP8 throughput should you use for mixed-precision training when FP16 is 989 TFLOPS, and which FP8 format is best for gradients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4199", "title": "Feature Store Online vs Offline Latency Gap", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this fail, and what latency math definitively proves it is unviable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4200", "title": "Feature Freshness vs Staleness Budget", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the staleness problem and what freshness architecture fixes it without rebuilding the entire pipeline?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 1}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4201", "title": "Point-in-Time Correctness for Training", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What data leakage pattern is occurring, and how do you architect point-in-time correct feature retrieval?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 1}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4202", "title": "Feature Store Serving Throughput Under Load", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total Redis operations per second does this ad ranking feature workload require, can one 300K ops/s instance handle it, and how would you scale?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 1}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4204", "title": "Feature Store Schema Evolution", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you manage this schema evolution without breaking production models or requiring simultaneous retraining of all 12?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4205", "title": "Feature Store Cost Optimization on H100 Clusters", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compare DynamoDB, spot Spark, and feature pruning to cut the $120K/month feature store spend by 40% without increasing latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4206", "title": "Feature Serving Architecture for Multi-Model Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the feature serving layer to fetch 800 unique features for 30 models within a 10 ms p99 budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4207", "title": "Training-Serving 
Skew Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this massive training-serving skew despite a shared feature store, and how can it be prevented?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4208", "title": "Feature Store Backfill Strategy", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you backfill the feature store efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4209", "title": "Feature Store for Real-Time ML on A100 Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a prefetch architecture to overlap the 8ms feature fetch with 3ms GPU inference and raise utilization above 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4210", "title": "Feature Store Entity Key Design", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What entity key do you use, and what are the storage implications compared to user-level features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4211", "title": "Feature Store Monitoring and Drift Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor drift across 800 hourly features without adding 45 minutes to the 15-minute pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4212", "title": "Feature Store Migration from Monolith", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you execute this migration without disrupting production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4213", "title": "Feature Store Consistency Guarantees", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the dual-write feature store diverge, and what architecture provides eventual consistency with bounded staleness?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 3}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4214", "title": "Embedding Feature Serving at Scale", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can Redis handle this workload, and what are the alternatives if not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4215", "title": "Parquet vs TFRecord for Training Throughput", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which format, Parquet or TFRecord, should you use for the 5 TB image dataset to sustain 10 GB/s on 8× A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4216", "title": "Columnar vs Row Format for Feature Tables", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much I/O does a columnar format save versus a row format, and what enables this savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4217", "title": "WebDataset for Distributed Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does WebDataset solve this, and what shard size do you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4218", "title": "Compression Tradeoffs for ML Data Pipelines", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which option (Snappy, Zstd, or uncompressed) best balances cost and performance for 5 epochs over the 10 TB Parquet dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4219", "title": "Storage Format for Streaming vs Random Access", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage format property causes this extreme random access latency, and how do you fix it without duplicating the dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4220", "title": "Parquet Row Group Sizing for ML Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's wrong with the row group size, and what should it be to fix the I/O bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], 
"chain_positions": {"cloud-chain-auto-secondary-007-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4221", "title": "Delta Lake vs Parquet for ML Versioning", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What properties does Delta Lake add over raw Parquet to solve these issues, and what are the performance costs?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], "chain_positions": {"cloud-chain-auto-secondary-007-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4222", "title": "Storage Format Selection for Multimodal Datasets", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compare separate modality files, WebDataset tar shards, and Lance for the 80 TB multimodal dataset on throughput, storage, and ops complexity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4223", "title": "Tokenized Dataset Storage Format", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which token storage format—Parquet, raw memmap, or Arrow IPC—best optimizes 200B-token LLM training on 8x H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4224", "title": "Storage Format Impact on Shuffle Performance", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare chunk-based, index-based, and streaming shuffle for a 4 TB dataset; which minimizes time-to-first-batch while maintaining statistical quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4225", "title": "Petabyte-Scale Format Migration Strategy", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What format choices, migration strategy, and rollback plan would you use to standardize 5 PB across 200 ML datasets within 6 months?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4226", "title": "Storage Format for Streaming Inference Logs", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a low-cost storage pipeline for 500K LLM inference logs per second at 3.5 
GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4227", "title": "DP-SGD Training Cost on H100", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much slower will DP-SGD be than 4-hour standard SGD for BERT-base on 8× H100s, and what is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4229", "title": "Privacy-Utility Tradeoff in LLM Fine-Tuning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is full DP-SGD viable for GPT-2 medical NER at ε=1, and what alternatives improve the privacy-utility tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4231", "title": "DP Composition Across Multiple Model Releases", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What total privacy loss do basic, advanced, and RDP composition give after 12 ε=3 releases, and when must releases stop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4232", "title": "PII Detection and Scrubbing Pipeline", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With 500M documents averaging 2 PII instances each, how many PII instances leak through, and what is the defense-in-depth strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4233", "title": "DP Synthetic Data Generation", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What epsilon provides useful synthetic data, and how do you validate that the synthetic images are both private and medically useful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4234", "title": "GDPR Right to Erasure with DP Training", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Does DP-SGD at ε=5 satisfy GDPR erasure, and if not, what does retraining cost and what unlearning alternatives exist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4235", "title": "DP-SGD Hyperparameter Tuning Without Spending Privacy Budget", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", 
"zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you tune hyperparameters without exhausting the privacy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4236", "title": "Estimating activation memory for backward pass on H100 with large batch", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Do the batch-32, seq-2048 activations for a 7B BF16 transformer fit in 80 GB HBM without checkpointing?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4237", "title": "Debugging a graph break in torch.compile during transformer training", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is torch.compile only giving an 8% speedup, and how would you fix the 47 graph breaks?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4238", "title": "Implementing a numerically stable custom backward pass for a fused loss function", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement a numerically stable forward and backward for this fused cross-entropy autograd Function on MI300X?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4239", "title": "Understanding why torch.compile retraces after dynamic batch size changes", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What causes the 3-5 second torch.compile stalls with variable padded batch sizes, and how would you eliminate them?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4240", "title": "Quantifying activation checkpointing trade-off for LLM pretraining on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", 
"bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much memory does checkpointing every other transformer block save, and what compute overhead does it add on MI300X?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4241", "title": "Designing a custom autograd function for a differentiable rendering operation", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you wrap the custom H100 ray-marching CUDA kernel so gradients flow through PyTorch autograd?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4242", "title": "Tracing vs. capture: choosing between torch.jit.trace and torch.compile for production", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which PyTorch graph/deployment path would you use for the static BERT transformer and data-dependent post-processing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4243", "title": "Memory layout of autograd computational graph for second-order optimization on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compute K-FAC second-order gradients for ResNet-152 on MI300X without unbounded graph memory growth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4244", "title": "Profiling autograd overhead in a training loop to identify bottlenecks", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and reduce the 280 ms backward time in this dynamic GNN on H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4245", "title": "Differentiating through a sort operation for learning-to-rank on TPU", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you make the NDCG-style ranking loss differentiable so gradients flow back through item scores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-4246", "title": "Managing gradient accumulation with mixed precision and autocast on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you structure BF16 autocast and gradient accumulation over 8 micro-batches on MI300X to avoid NaNs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4247", "title": "Understanding the autograd graph lifecycle and preventing memory leaks", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the most likely cause of the ~200 MB per-step GPU memory growth, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4248", "title": "Implementing gradient checkpointing for a custom attention mechanism", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you checkpoint the 100K-token chunked attention so backward does not recompute shared KV projections?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4249", "title": "Analyzing the computational graph captured by TorchDynamo for a complex model", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you inspect and fix the torch.compile graph breaks or missed fusion in the H100 MoE routing path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4250", "title": "Handling in-place operations and their effect on the autograd graph", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does in-place ReLU intermittently break autograd here, and what should the team change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4251", "title": "Optimizing backward pass memory for a contrastive learning model on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compute the 4096×4096 CLIP contrastive loss without materializing O(N²) gradients and OOMing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-4252", "title": "Diagnosing NaN gradients in a deep network with custom autograd operations", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you stabilize and debug the custom SDF backward that divides by |∇f|?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4253", "title": "Comparing forward-mode vs reverse-mode AD for Jacobian computation on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 10-output, 3-input PINN Jacobian on H100, should you use jacfwd or jacrev, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4254", "title": "Autograd graph capture for CUDA graphs on A100 for inference acceleration", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you capture CUDA graphs for GPT-2 XL batch-1 autoregressive decoding with a growing KV cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4255", "title": "Gradient accumulation correctness with DDP and autograd on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "If each of 8 MI300X GPUs has a 2 GB gradient, how much total network traffic does one ring AllReduce move?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4256", "title": "Designing a differentiable data augmentation pipeline for training on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement random crop, color jitter, and cutout so the GAN discriminator loss backpropagates to the generator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4257", "title": "Selective gradient computation using requires_grad and no_grad for fine-tuning", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you freeze the first 20 BERT-large layers to reduce memory while fine-tuning only the last 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4258", "title": "Implementing 
double backward for meta-learning with MAML on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement MAML second-order inner-loop gradients without the 8x slowdown and 40 GB memory blowup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4259", "title": "Understanding graph break costs in torch.compile for a custom CUDA extension", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you make the custom attention CUDA op traceable by torch.compile without graph breaks?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4260", "title": "Autograd correctness testing with gradcheck and gradgradcheck", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you verify that the differentiable beam search decoder's custom backward pass is analytically correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4261", "title": "MI300X XCD Count and Die Topology", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the actual MI300X chiplet topology, and why is the 192 GB HBM pool not a monolithic die?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4262", "title": "Chiplet Die-to-Die Bandwidth vs HBM Bandwidth", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What chiplet-level bottleneck should you investigate first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4263", "title": "Yield-Performance Tradeoff in Chiplet Design", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What yield and performance arguments should you present?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": {"cloud-chain-auto-secondary-005-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4264", "title": "NUMA Effects in MI300X Multi-XCD Workloads", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix NUMA-related MFU loss on MI300X versus H100?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4265", "title": "Coherency Domains Across XCDs", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What coherency constraints must you enforce across XCDs when sharing intermediate reduction buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4266", "title": "APU Unified Memory and Zero-Copy Data Paths", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the performance implications of using unified memory on the MI300X APU compared to explicit memory copies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4267", "title": "Interposer Bandwidth Scaling Laws", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does aggregate bandwidth scale sublinearly with XCD count in a 4-XCD chiplet system, and what scaling should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4268", "title": "Chiplet vs Monolithic Roofline Model", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does the NUMA bandwidth hierarchy of the MI300X affect kernel tuning versus a monolithic H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4269", "title": "Die-to-Die Link Latency vs NVLink Comparison", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is MI300X intra-package Infinity Fabric equivalent to NVLink 4.0 for collectives, quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4270", "title": "Model Parallelism Partitioning for XCD Locality", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": 
"How would you partition the 70B model across the 8 XCDs to maximize local memory access?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4271", "title": "HBM3 Stack Partitioning Across MCDs", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How are the 4 Memory Cache Dies (MCDs) in MI300X physically connected to the 8 XCDs, and what does this mean for memory access symmetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4272", "title": "Chiplet Architecture and Multi-Tenant Isolation", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural challenges arise from XCD-level multi-tenancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4273", "title": "Power Delivery Across Chiplet Dies", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing non-uniform XCD thermal throttling on MI300X during sustained inference, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4274", "title": "Infinity Fabric Topology and All-Reduce Efficiency", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How does the physical topology constrain the logical ring order?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4275", "title": "Cross-Die Prefetching for LLM Decode", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you prefetch remote-XCD KV-cache blocks to hide Infinity Fabric latency during MI300X autoregressive decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4276", "title": "Adapter Serving Infrastructure: S-LoRA Paged Memory Design", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design S-LoRA-style paged adapter memory for 1000 LoRA adapters with only 50 active, 
and what is the overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4277", "title": "LoRA Rank Selection and Memory Budget", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the H100 memory budget for rank-16 LoRA fine-tuning of a 7B model versus full fine-tuning?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4278", "title": "QLoRA 4-bit Quantization Memory Arithmetic", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does QLoRA fit 70B fine-tuning on two H100s, and what is the actual GPU memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4279", "title": "Multi-Adapter Batching and Shared Base Model", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you lay out memory and batch requests for 50 rank-16 LoRA adapters sharing one 13B base model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4280", "title": "RLHF Infrastructure: PPO Training Pipeline Architecture", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What four models are needed for PPO RLHF on a 7B policy, and how would you schedule them across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4281", "title": "Reward Hacking Detection in RLHF Systems", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you diagnose the reward hacking after 3000 PPO steps and what infrastructure-level mitigations would you implement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4282", "title": "LoRA Adapter Merging and Serving Tradeoffs", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When is each strategy optimal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4283", "title": "PEFT Memory Budget Across Parallelism Strategies", "topic": 
"model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the per-GPU memory breakdown for tensor-parallel LoRA training on 8 GPUs, and what is the binding constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4284", "title": "Adapter-Aware KV Cache Management", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can't PagedAttention KV cache pages be shared across different LoRA adapters, and how much memory overhead does this create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4285", "title": "RLHF vs RLAIF Infrastructure Cost Comparison", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cost crossover point where RLAIF becomes significantly more cost-effective than RLHF for 10K-100K preference comparisons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4286", "title": "Personalization at Scale: Per-User LoRA vs Prompt Engineering", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "At what scale does each approach break down when comparing per-user LoRA adapters vs long system prompt personalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4287", "title": "Gradient Accumulation in LoRA Training Stability", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the LoRA loss instability and gradient explosions after 500 steps, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4288", "title": "LoRA Rank Sensitivity Analysis", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you select the optimal LoRA rank for the 34B code model, and why do returns diminish beyond rank-64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4289", "title": "Federated LoRA Training for Privacy-Preserving Adaptation", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design federated LoRA training across 50 hospitals with 
differential privacy guarantees?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4290", "title": "Mixture of LoRA Experts (MoLoRA) Architecture", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design token routing to 8 specialized LoRA adapters, and what are the compute and memory overheads?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4291", "title": "Prefill-Decode Split Rationale", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does disaggregated serving split prefill and decode onto separate GPU pools, and what hardware bottleneck makes sharing inefficient?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 0}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4292", "title": "KV-Cache Transfer Bandwidth Budget", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the KV-cache transfer size and latency over 400 Gbps InfiniBand, and is it on the critical path for TTFT?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 0}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4293", "title": "TTFT vs TPOT SLO Tension", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What does moving two decode GPUs to prefill fix and break, and what metric determines the right pool ratio?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 1}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4294", "title": "Prefill Stall During Decode Migration", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What causes intermittent 4-second P99 TTFT spikes when decode GPUs have high memory utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4295", "title": "Chunked Prefill Scheduling Window", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What chunk scheduling policy would keep TTFT under 200 
ms for inputs up to 32k tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4296", "title": "MI300X KV Pool Sizing vs H100", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 70B FP16 decode pool, how do the two GPUs compare on concurrency and token throughput, and which is better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4297", "title": "Decode Pool Autoscaling Lag", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What leading indicator should trigger decode autoscaling before TPOT degrades, and how much would it reduce SLO violation duration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4298", "title": "Cross-Pool KV Compression Tradeoff", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you compress the KV-cache before prefill-to-decode transfer, and under what conditions is the 2x transfer reduction net-positive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4299", "title": "Speculative Decoding in Disaggregated Architecture", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Where should the 7B draft model run in disaggregated speculative decoding, how should KV caches interact, and what failure modes arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4300", "title": "Prefill-Decode Affinity and Context Caching", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where should prefix KV-caches live in a disaggregated architecture, how should they be invalidated, and what TTFT savings come from a 60% hit rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4301", "title": "Decode Preemption and KV Swap", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do resumed preempted sequences have 8x TPOT for the first 20 tokens, and how would you fix it?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 0}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4302", "title": "Optimal Prefill Batch Size for TTFT", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At what sequence length and batch size does the prefill become compute-bound versus memory-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4303", "title": "Network Topology for KV Transfer at Scale", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "At 500 req/s transferring 384 MB of KV-cache each, is the 400 Gbps 2:1 oversubscribed InfiniBand network a bottleneck, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 1}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4304", "title": "Variable Output Length and Decode Pool Imbalance", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin assignment create such extreme decode memory imbalance, and how would you redesign the system to rebalance work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4305", "title": "TPU v5e as Prefill Accelerator", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is TPU v5e a better prefill accelerator than H100 for a 7B BF16 model, and what practical constraints make the TPU-GPU setup painful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4306", "title": "Batching Strategy Across Prefill Nodes", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you batch 50-8192-token prefill requests on 8 GPUs to maximize throughput while keeping TTFT variation within 3x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4307", "title": "Graceful Decode Node Failure During Generation", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you recover 200 in-flight sequences after a decode GPU failure, and what state should be checkpointed at what frequency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4308", "title": "Continuous vs. 
Static Batching in Decode Pool", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does continuous batching improve decode utilization for asynchronous prefill arrivals, and what scheduling event enables it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4309", "title": "Multi-Model Disaggregated Serving Isolation", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect disaggregated serving for 5 LLMs and thousands of tenants while isolating KV caches and capacity across SLOs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4310", "title": "Prefill Throughput Saturation Diagnosis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the 4-GPU prefill pool stuck at 150k tokens/s with 35% GPU compute and 90% PCIe utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4311", "title": "KV-Cache Radix Tree for Prefix Sharing", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much memory does radix-tree prefix sharing save, and how many concurrent requests fit with versus without sharing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4312", "title": "Tail Latency of Decode Under Mixed Workloads", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 8× P99/P50 TPOT tail on H100 with 30% long outputs, and what priority scheduling intervention would reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4313", "title": "Disaggregation vs. 
Colocation Break-Even Analysis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At what request rate does disaggregated serving outperform colocation for the 13B mixed workload on 32 H100s after KV transfer overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4314", "title": "Decode Throughput Scaling with Tensor Parallelism", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do TP=2 H100s and TP=1 MI300X compare for 70B decode latency and throughput, and when does AllReduce become the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4315", "title": "Disaggregated Serving Cost Model", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What H100 prefill to MI300X decode pool ratio minimizes cost at 10,000 req/hr while meeting 200 ms TTFT and 50 ms TPOT?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 2}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4316", "title": "DLRM Architecture Overview", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the two distinct compute pathways, why are they architecturally separate, and what operation fuses their outputs?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 0}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4317", "title": "TB-Scale Embedding Table Sharding", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the sharding strategy to minimize P99 lookup latency, and what determines which tables go to GPU vs CPU?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 1}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4318", "title": "Multi-Stage Ranking Latency Budget", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you allocate the 150 ms SLO across the 4-stage recommendation pipeline, and which stages benefit most from GPU 
acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4319", "title": "Embedding Lookup All-to-All Communication", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does embedding all-to-all dominate DLRM training, and what architectural change would reduce communication volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4320", "title": "Online Learning Staleness and Embedding Drift", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is a 10-minute online embedding update cadence insufficient during 30-60 minute viral events despite the model updating on time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4321", "title": "Feature Interaction Layer Arithmetic Intensity", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For batch size 2048, what are the interaction-layer FLOPs and is it compute-bound or memory-bandwidth-bound on an A100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4322", "title": "Embedding Table Update Consistency in Distributed Training", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do high-frequency item embeddings have 3x higher loss, and what optimizer and update-protocol changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4323", "title": "Social-Scale Serving QPS and Caching Strategy", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a multi-tier embedding cache, and what hit rate can a 1 TB GPU HBM L2 cache achieve under Zipf α=0.8?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 2}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4324", "title": "Mixed-Precision Embedding Tables", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Compare FP16, INT8, and INT4 quantization for embeddings: what is the accuracy impact mechanism for each, what is the bandwidth reduction, and which is preferred for the 
hot/cold split?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4326", "title": "Two-Tower Model vs. DLRM for Retrieval", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which is appropriate for retrieval vs. ranking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4327", "title": "Real-Time Feature Pipeline Latency", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architecture change would cut P99 feature assembly from 45 ms to under 15 ms for the 50 real-time features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4328", "title": "Gradient Accumulation for Sparse Embedding Training", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do sparse gradients OOM during 16-step accumulation on 192 GB GPUs, and how would you fix it without reducing batch or accumulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4329", "title": "Cold Start for New Item Embeddings", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the cold start problem, and what three methods can initialize a new item's DLRM embedding before organic training data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4330", "title": "Model Parallelism Strategy for 100TB Embedding Cluster", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid training strategy and throughput would you expect for 100 TB embeddings and a 10B-parameter MLP on 512 H100s plus CPU nodes?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 3}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4331", "title": "Session-Based Recommendation Temporal Features", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is wrong with concatenating 50 item embeddings into a 3200-dim DLRM input, and what session encoder would you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4332", "title": "Serving Latency vs. Model Freshness Tradeoff", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 100k QPS, how do synchronous online updates compare with 5-minute shadow updates for CTR staleness and serving latency?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 2}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4333", "title": "Recall @100 vs. NDCG Tradeoff in Retrieval Optimization", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can Recall @100 improve from 72% to 81% while end-to-end NDCG @10 drops from 0.38 to 0.35?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4334", "title": "Embedding Dimension Selection and Capacity", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum item embedding dimension fits in 512 GB for 100M FP32 items, and what dimension would you choose using intrinsic dimensionality?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 0}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4335", "title": "Request Deduplication and Result Caching in Rec Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a result caching layer for 30-second re-requests, and what GPU load reduction is expected?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 1}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4336", "title": "DLRM Training on TPU v5e Pod", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which DLRM components map well to a TPU v5e pod, which do not, and what architectural change makes the model TPU-trainable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4337", "title": "Learned Positional Embeddings for Sequence Modeling", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": 
"understand", "status": "published", "phase": "inference", "question": "Why do positional embeddings improve session next-item prediction, and when can positional information hurt generalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4338", "title": "Embedding Table Hot Row Replication for Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you replicate the top-10k hot embedding rows across 16 A100 serving nodes, and what read-throughput improvement would that give?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 2}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4339", "title": "Diversity vs Relevance Tradeoff in Re-ranking", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What system-level mechanism caused the 40% diversity collapse despite +3% CTR, and what re-ranking intervention would recover diversity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4340", "title": "Full-Stack RecSys Architecture for a New Social Platform", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What end-to-end recommendation architecture would you build for 100M users at 500k QPS, and how would it evolve to 1B users at 5M QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4341", "title": "DP-SGD Epsilon Budget Exhaustion at Scale", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are your architectural options to extend the remaining budget, and what are the quantitative tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4342", "title": "Privacy Amplification via Subsampling in Production", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does this claim hold, and what are the implementation pitfalls in a distributed multi-GPU training context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4343", "title": "Federated DP with Heterogeneous Client Noise", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", 
"phase": "both", "question": "Why isn't the DP-FedAvg privacy bound tighter with 10,000 enrolled clients but only about 200 participants per round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4344", "title": "Membership Inference Attack Resistance Under DP", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is a 61% MIA AUC theoretically possible under an ε=4 DP guarantee, and what vulnerabilities allow it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4345", "title": "Intersectional Fairness Under Distribution Shift", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What caused this and how do you detect and fix it?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 3}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4348", "title": "Multi-Metric Fairness Dashboard for Production LLM", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you build a fairness monitoring pipeline for 500K daily job-description requests, and what compute cost would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4349", "title": "Model Card Infrastructure for 50-Model Production Fleet", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a system that auto-generates and maintains living model cards for 50 production models under EU AI Act requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4350", "title": "Red-Teaming Throughput vs Coverage Tradeoff", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the 500 person-hour red-team budget be allocated across free-form, taxonomy-guided, and automated adversarial testing?", "chain_ids": ["cloud-chain-auto-secondary-015-37"], "chain_positions": {"cloud-chain-auto-secondary-015-37": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4351", "title": "RLHF Reward Hacking and Constitutional AI Safeguards", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", 
"status": "published", "phase": "both", "question": "How would you fix the RLHF reward-hacking pattern at the training, inference, and monitoring layers?", "chain_ids": ["cloud-chain-auto-secondary-015-37"], "chain_positions": {"cloud-chain-auto-secondary-015-37": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4352", "title": "Operational vs Embodied Carbon in a 1,000-GPU Training Cluster", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the operational and embodied carbon emissions for the H100 and A100 training options, and which is more carbon-efficient over a 3-year hardware lifecycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4353", "title": "Carbon-Aware Training Job Scheduling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule a continuous 12-hour, 200-GPU training job within a 48-hour window to minimize carbon while meeting the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4354", "title": "Lifecycle Carbon Analysis of a 3B Model Serving System", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the lifecycle carbon per inference for the 3B LLM over 2 years, and which component dominates emissions?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 4}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4355", "title": "Carbon Cost Per Inference: INT8 vs FP16 Serving Comparison", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the carbon per 1M tokens for FP16 versus INT8 13B inference, and how much annual carbon does INT8 save at 50B tokens/day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4356", "title": "Spatial Carbon Arbitrage: Multi-Region Training Routing", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the carbon and performance tradeoffs compare for all US-East, all EU-North, and split US-West/EU-North routing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4357", "title": "Carbon-Aware Autoscaling for Inference Endpoints", "topic": "sustainability-carbon-accounting", 
"competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign spot-instance autoscaling to use real-time carbon intensity while meeting the 6-hour SLA for 99% of transcription jobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4358", "title": "SustainabilityML: Reporting Carbon per Training Run", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the IT energy, facility energy, market-based emissions, and location-based emissions (assuming a regional grid of 350 gCO2e/kWh) for the training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4359", "title": "MI300X Roofline vs H100: Memory-Bound vs Compute-Bound Boundary", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the MI300X and H100 roofline ridge points, and which has higher LLaMA-2 70B decode throughput at AI=0.8 FLOP/byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4360", "title": "MI300X 192GB vs H100 80GB: Multi-Model Serving Density", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can a single 192GB or 80GB accelerator host 8 concurrent 13B FP16 replicas plus KV cache for 512 requests, and what is the replica density advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4361", "title": "TPU v5e vs H100: Systolic Array Efficiency for Transformer Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At AI≈100 FLOP/byte for 7B transformer training, which accelerator delivers higher effective throughput and MFU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4362", "title": "MI300X XCD Architecture and Memory Locality", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does splitting the kernel across all 8 MI300X XCDs reduce bandwidth from 4.2 TB/s to 3.1 TB/s, and how should memory locality be handled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4363", "title": "Serving a 70B Model: MI300X Single-Card vs H100 Tensor Parallel", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "For the 70B LLM at 200 req/s and p99 <200 ms, how do MI300X PCIe TP=2 and H100 NVLink TP=2 compare, and what should you deploy?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 1}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4364", "title": "TPU v5e ICI Topology vs H100 NVLink for Data Parallelism", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using ring AllReduce for a 12GB gradient on 256 accelerators, what are the theoretical times on TPU v5e ICI versus H100 over InfiniBand?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 1}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4365", "title": "MI300X INT4 Quantization: VRAM and Throughput vs H100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 70B INT4 GPTQ model, what token throughput should H100 and MI300X achieve, and does MI300X keep its bandwidth advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4366", "title": "TPU v5e BF16 Accumulation and Training Stability vs H100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the TPU v5e BF16 run 1.8% higher loss after 50K steps than the H100 FP16 baseline, and how would you fix it on TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4367", "title": "MI300X Power Budget and Thermal Throttling Under Full Load", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the MI300X thermally limited at 92°C and 742W under sustained GEMM, and how does its thermal headroom compare with H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4368", "title": "ROCm vs CUDA Ecosystem Overhead for MI300X Deployment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much of the 85% MI300X vLLM throughput is a ROCm kernel gap versus hardware capability, and what is expected with FlashAttention on ROCm?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4369", "title": "MI300X MIG vs H100 MIG: Multi-Tenant Serving Partitioning", "topic": "model-serving-infrastructure", "competency_area": 
"deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you serve 14 tenant-isolated 7GB models on H100 MIG versus MI300X, and what MI300X multi-tenant design would you use?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 2}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4370", "title": "TPU v5e Pod AllReduce Topology for 1T Model Training", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you map TP=8, PP=16, and EP=16 for the 1T MoE model onto a 16×16×8 TPU torus, and how does it compare to H100?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 2}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4371", "title": "MI300X Unified Memory for CPU-GPU Tensor Offloading vs H100 PCIe", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 180GB model with a 50GB working set and 130GB KV cache, how does PCIe offload compare with unified memory for cold cache fetch latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4372", "title": "TPU v5e vs H100 for Fine-Tuning LoRA at Scale", "topic": "quantization-fundamentals", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which configuration finishes faster, and what is the per-dollar cost comparison assuming $1.2/chip-hr for TPU v5e and $3.5/GPU-hr for H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4373", "title": "MI300X vs A100: VRAM Capacity Advantage for Long-Context Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the 13B model at 128K context, how large is one request's KV cache, and how many requests fit on 80GB vs 192GB GPUs?", "chain_ids": ["cloud-chain-auto-008-19"], "chain_positions": {"cloud-chain-auto-008-19": 1}, "chain_tiers": {"cloud-chain-auto-008-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4374", "title": "MI300X Prefill Throughput: Compute-Bound vs H100 for Long Prompts", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which accelerator is compute-bound for prefill at this AI, and what are the expected FLOP efficiencies between A100, H100, and MI300X?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4375", "title": "TPU v5e vs MI300X: Serving Cost Per Token at Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For serving the 7B LLM at 100B tokens/day, how many small chips or large GPUs are needed and what is the cost per 1M tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4376", "title": "MI300X AllReduce Performance in 8-GPU Data Parallel Training vs H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 28GB gradient AllReduce on 8 GPUs, what ring-AllReduce time do MI300X xGMI and H100 NVLink achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4377", "title": "MI300X for Mixture-of-Experts: Expert Capacity and Memory Layout", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What serving configuration should you use for Mixtral 8×7B on H100 versus MI300X, and what decode throughput should each deliver?", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 1}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4378", "title": "Vendor Lock-In Analysis: TPU v5e vs MI300X vs H100 for a 5-Year Infrastructure Plan", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator should be the primary platform for a 5-year 7B–100B training and serving commitment considering software ecosystem, hardware upgrades, and TCO, and what is your hedge strategy?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 2}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4379", "title": "Disparate Impact Testing for LLM Embedding Spaces", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If you select the top 20% of 10,000 candidates, what are the Group A and B selection rates and does the embedding system violate the 4/5ths rule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4380", "title": "MI300X HBM3 Bandwidth as a Power Efficiency Lever", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": 
"remember", "status": "published", "phase": "both", "question": "What are the MI300X and H100 SXM5 peak HBM bandwidths, and how does higher bandwidth affect accelerator count and rack power for inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4381", "title": "PUE Decomposition for Dense H100 Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For the 1,000-node cluster at PUE 1.4, what is the total facility power, non-IT overhead, and which PUE components dominate compared to traditional compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4382", "title": "Stranded Power in Mixed H100/A100 Rack Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you fill a 25 kW rack with 8 kW H100 and 5.2 kW A100 nodes to maximize GPUs, and how much power is stranded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4383", "title": "Thermal Throttling Cascade in a Dense H100 Pod", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing Pods 7-12 to drop from 700W to 500W with 38°C hot aisles, and what immediate and long-term remediation should you take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4384", "title": "Power Capping Strategy for MI300X Training Under Budget Constraints", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do static 590W caps, 25% batch-size reduction, and dynamic power capping affect throughput and energy efficiency under the 300 kW cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4385", "title": "Waste Heat Recovery Feasibility for H100 Liquid-Cooled Racks", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can 55°C D2C return coolant from 500 H100 nodes be used for 60°C district heating, what options bridge the gap, and how much heat is recoverable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4386", "title": "GPU Power State Transitions and Idle Power Optimization", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should H100 P-states and persistence mode be configured to reduce idle power, and what savings are possible for 200 GPUs at 40% utilization?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4387", "title": "MI300X vs H100: Energy-per-Token at Scale for LLM Inference", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 100B tokens/day of Llama-3 70B FP16 inference, which platform has lower energy per token after sharding and bandwidth effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4388", "title": "Dynamic Voltage and Frequency Scaling for H100 Training Efficiency", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What cluster-level DVFS policy would smooth training power under the 25 MW budget while exploiting GEMM and all-reduce phase differences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4389", "title": "Power Distribution Unit Over-Subscription in GPU Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a 30 kW continuous, 36 kW peak PDU handle three 8 kW H100 nodes ramping simultaneously, and what soft-start policy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4390", "title": "Computational Carbon Intensity of MI300X Fine-Tuning Runs", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the carbon footprint of the 64×MI300X fine-tuning job in the Pacific Northwest versus the Southeast, and how should it be reported?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4391", "title": "Cooling Fluid Routing for Hot-Aisle Containment in H100 Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does hot-aisle containment improve PUE for 8 kW nodes, what thermodynamic mechanism matters, and what PUE gain is typical?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4392", "title": "Server Power Supply Efficiency Curves and 80 PLUS Certification Impact", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the 80 PLUS Gold, Platinum, and Titanium efficiencies at 50% load, and how much does Titanium save versus Gold for one 8×H100 server drawing 8 kW IT load at 50% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-4393", "title": "Workload Consolidation to Improve GPU Utilization and Power Efficiency", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much power would consolidating 1,000 underutilized GPUs save, what risks does it create, and how would MIG or time-slicing improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4394", "title": "Transformer Inference Memory Bandwidth Saturation and Power Envelope", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is energy per token lower at batch 64 than batch 1, and what batch size minimizes energy while meeting a 200 ms TTFT SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4397", "title": "MI300X OAM Module Thermal Interface and Cooling Validation", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Implement a monitoring policy that prevents thermal throttling, identify the thermal margin remaining, and calculate the maximum safe utilization given your ambient conditions.", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 2}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4398", "title": "Power Anomaly Detection for H100 Training Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might h100-rack07-node04 sustain 820W per GPU without errors, is it a fault, and what power thresholds should trigger escalation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4399", "title": "Fleet-Wide GPU Energy Efficiency Benchmarking with Performance-per-Watt", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a single energy-efficiency score for a heterogeneous inference fleet, and what would you recommend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4400", "title": "MI300X Unified Memory Architecture and Its Thermal Implications", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why did the MI300X serving process jump to 5.1 TB/s bandwidth and 740W, what are the thermal consequences, and what workload change caused it?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4401", "title": "Carbon-Aware Inference Routing Across Multi-Region H100 Fleets", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What carbon-aware routing policy would you use for the 18,000 q/h service, and how much Scope 2 carbon would it save under a 200 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4402", "title": "Scope 3 Embodied Carbon Accounting for MI300X Cluster Procurement", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For the 512-GPU cluster, how do Scope 2 and Scope 3 lifecycle emissions compare, and when does embodied carbon exceed operational carbon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4404", "title": "H100 Inference SLA Degradation Under Noisy Neighbor CPU Contention", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the p99 latency spike to 380 ms with unchanged p50 and GPU utilization, and how would you mitigate it immediately and systemically?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4405", "title": "Disaggregated Prefill-Decode Architecture for Tail Latency Control", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a disaggregated prefill/decode architecture that achieves p99 TTFT < 500ms while maintaining end-to-end p99 generation latency < 4 seconds for a 500-token output, including the KV cache transfer cost?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 2}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4406", "title": "RDMA Write Throughput Saturation in MI300X AllReduce Rings", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 512-GPU ring AllReduce only achieving 140 Gbps with 35% RDMA retransmits, and what network tuning would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4407", "title": "RDMA Queue Pair Limits and Scalability in Large H100 Pods", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At 512-1,024 nodes, why does NCCL initialization hang from RDMA QP exhaustion, and what scaling limits 
and architectural fixes would you propose?", "chain_ids": ["cloud-chain-auto-020-03"], "chain_positions": {"cloud-chain-auto-020-03": 0}, "chain_tiers": {"cloud-chain-auto-020-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4408", "title": "RDMA Memory Registration Overhead in Dynamic Batch LLM Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does dynamic tensor allocation add 310ms of ibv_reg_mr overhead per training step, and how would you redesign RDMA memory management?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4409", "title": "UCIe Bandwidth Scaling for Multi-Chiplet AI Accelerators", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What UCIe bandwidth constraint prevents 4 TB/s compute-to-HBM bandwidth, and what chiplet floorplan would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4410", "title": "MI300X NUMA-Aware Tensor Parallel Rank Placement", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you fix the rank placement to resolve the 18% all-reduce latency penalty on MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4411", "title": "Chiplet Yield Model and Cost per Compute Die", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the 600 mm² monolithic die versus six 100 mm² chiplets, what are the yield and cost differences, and where is the crossover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4412", "title": "HBM4 Stack Integration Timing for Next-Gen AI Die", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the 24-month training accelerator roadmap choose HBM3E now or wait for HBM4, given the bandwidth-compute balance and schedule risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4413", "title": "Coherency Protocol Overhead Across XCD Boundaries", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does XLA fusion slow down 40% when tensors are on different XCDs, and how would you fix the data placement?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4414", "title": "Active Interposer vs Passive Silicon Interposer Tradeoffs", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use an active interposer or a passive silicon interposer for the next AI accelerator, and what full-system tradeoffs determine the choice?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": {"cloud-chain-auto-secondary-005-04": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4415", "title": "Chiplet-Based GPU Multi-Instance Partitioning", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design strong multi-tenant isolation for MI300X by mapping tenant slices onto its XCD topology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4416", "title": "Infinity Fabric vs NVLink for Scale-Up Topology", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 70B LLM training with TP=8, how do 8x NVLink and 8x Infinity Fabric nodes compare on all-reduce and memory capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4417", "title": "XCD Hot-Spot Thermal Management Under Sustained Load", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do XCDs 0 and 1 throttle and reduce throughput by 12% despite uniform TDP and temperatures, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4418", "title": "Disaggregated Memory Architecture: CXL vs HBM on Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 70B LLM inference, what are the bandwidth and latency tradeoffs of replacing some HBM stacks with CXL 3.0 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4420", "title": "Multi-Die Power Delivery Network Design for AI Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the PDN for the 4-compute-die, 2-HBM chiplet package to reduce matrix-multiply voltage droop from 8% to under 3%?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": {"cloud-chain-auto-secondary-005-04": 1}, 
"chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4421", "title": "Chiplet Roofline Model for Mixed Precision LLM Training", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At 120 FLOP/byte arithmetic intensity, are MI300X and H100 BF16 transformer forward passes bandwidth-bound or compute-bound, and what throughput do you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4422", "title": "Chiplet Die-to-Die Latency Impact on Pipeline Bubble Rate", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If a forward pass crosses boundaries 3 times at 180 ns each, what causes the 7% throughput drop, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4423", "title": "Heterogeneous Chiplet Integration: DSP + AI Core on Same Package", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the data path and memory sharing between the DSP die and AI inference die for FFT-heavy DNN inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4424", "title": "LoRA Rank Sensitivity to Task Complexity", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does LoRA rank 4 underperform while rank 64 gives no gain over rank 16 for SQL generation, and which rank should you choose?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4425", "title": "QLoRA Double Quantization Memory Accounting", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory does QLoRA need to fine-tune a 70B model with NF4 and double quantization on one 80GB GPU, and is it feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4426", "title": "Multi-Adapter Hot-Swap Serving with S-LoRA", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you load and cache 500 LoRA adapters for the 4xH100 
serving system to keep P99 TTFT below 100 ms at 200 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4427", "title": "RLHF Reward Model Overoptimization Detection", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are reward scores rising while human ratings fall with KL at 12 nats after 3 PPO epochs, and how would you fix the reward hacking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4428", "title": "Adapter Composition: Sequential vs Parallel LoRA Merging", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For medical and formal-tone LoRA adapters applied together, should you use sequential composition, parallel merge, or task-vector arithmetic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4429", "title": "PEFT Strategy Selection for 100B+ Models at Cloud Scale", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 140B model and 10K examples, how do LoRA, (IA)^3, and prefix tuning compare on memory, performance, and training time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4430", "title": "LoRA Target Module Selection: Q/K/V/O vs All Linear Layers", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For Mistral 7B classification, how many LoRA parameters do Q+V-only versus all-linear targeting add, and what are the performance implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4431", "title": "Reward Model Architecture for RLHF at Scale", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What reward model size, architecture, and serving strategy would you use to score PPO completions at 500 QPS on a 4x GPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4432", "title": "Continual Adaptation: Catastrophic Forgetting in LoRA Updates", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did original-task performance degrade 15% after four monthly LoRA adapter updates even though the base weights were frozen, and how would you prevent it?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4433", "title": "PPO Clip Ratio Tuning for RLHF Stability", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is causing PPO reward collapses every 200 steps with high value loss and epsilon 0.2, and what tuning changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4434", "title": "Multi-Task LoRA Training with Gradient Conflict Resolution", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you diagnose and resolve gradient conflict when code generation loss rises after step 500 while summarization and QA improve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4435", "title": "DPO vs PPO Infrastructure Cost for Alignment at 70B Scale", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With 32 H100s for 1 week and 200K preference pairs, should you choose DPO or PPO to align the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4436", "title": "Adapter Versioning and Rollback in Production ML Systems", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design adapter versioning and rollback so a bad LoRA adapter v23 can revert to v22 within 5 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4437", "title": "LoRA Adapter Distillation for Latency-Sensitive Serving", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you distill the r=16 domain LoRA adapter for a 70B model to meet a P95 TTFT target below 50 ms while preserving quality?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4438", "title": "LoRA Training Data Efficiency: Minimum Viable Dataset Size", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is 500 examples sufficient for LoRA fine-tuning, or is the colleague right about needing 10K?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4439", "title": "Prefill Pool GPU Count Formula for TTFT SLO", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many H100s do you need in the prefill pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4440", "title": "KV-Cache Transfer Compression to Reduce Network Bottleneck", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache compression scheme would reduce the 3.2 ms prefill-to-decode transfer over 400 Gbps InfiniBand and lower TTFT overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4441", "title": "Decode Pool Scaling Policy Under Variable Output Length", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule the decode pool to prevent 2000+ token long-tail requests from spiking P99 TTOT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4442", "title": "Fault Tolerance for In-Flight Requests During Decode Node Failure", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you recover the 200 active generations after a decode node failure without restarting them from scratch?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 1}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4443", "title": "Prefill-Decode Ratio Optimization for LLM Serving Fleet", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What optimal prefill-to-decode GPU ratio would you choose to maximize GPU utilization for a 70B model with 256-token prompts and 512-token outputs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4444", "title": "Chunked Prefill Optimal Chunk Size Selection", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What chunk size would you choose for 2048-token prompts, and how does it trade TTFT against decode TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4445", "title": "KV Cache Network Topology for High-Bandwidth Transfer", "topic": "disaggregated-serving", "competency_area": 
"deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much P99 latency would 800 Gbps NDR save over 400 Gbps HDR for KV transfer, and what alternative would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4446", "title": "Disaggregated Serving Autoscaler Response Time Analysis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you autoscale decode capacity for 2x traffic spikes lasting 3 minutes when new nodes take 90 seconds to provision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4447", "title": "Context Caching with Disaggregated KV Stores", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a distributed KV cache for the common 1024-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4448", "title": "MI300X vs H100 Decode Pool Economics", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which GPU would you choose for the 70B decode pool, H100 SXM5 or MI300X, based on cost per token and memory capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4449", "title": "SLA-Aware Request Routing Between Prefill and Decode Pools", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you route premium and standard requests across 10 prefill GPUs to meet 100ms and 500ms P99 TTFT SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4450", "title": "Speculative Decoding in Disaggregated Architecture: Draft Pool Design", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the request flow, acceptance sampling, and resource sizing for 7B-draft/70B-target speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4451", "title": "Disaggregated Serving Observability: Key Metrics Dashboard", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which operational metrics should an SRE watch to distinguish prefill-pool, KV-transfer, and decode-pool problems?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4452", "title": "Prefill Batch Composition for Throughput Maximization", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you batch prompts from 32 to 4096 tokens to reduce the 60% padding waste?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4453", "title": "KV Cache Eviction Policy for Decode Pool Memory Management", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do >4K-token arrivals trigger vLLM preemption cascades at 95% KV utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4454", "title": "Disaggregated Serving Cost Model: Build vs Buy", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the startup build a disaggregated Llama-3 70B system or buy a managed API, and what is the break-even volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4455", "title": "Multi-Tenant KV Cache Isolation in Disaggregated Serving", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you isolate KV cache usage so tenant A's 10K-request burst cannot preempt tenant B's requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4456", "title": "Decode Straggler Detection in Large Batch Disaggregated Serving", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is likely causing 1% of decode requests to see 120ms/token while the median is 40ms/token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4457", "title": "Disaggregated Serving with TPU v5e Prefill Nodes", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is TPU v5e a good choice for 70B prefill nodes versus H100, and what integration issues would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4458", "title": "Request Migration Between Decode Nodes for Load Balancing", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", 
"zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you live-migrate decode requests from nodes 1-3 to nodes 4-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4459", "title": "Decode Pool Tensor Parallelism Degree Selection", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do TP=1 MI300X, TP=2 H100, and TP=4 H100 compare for 70B decode throughput, TPOT, and cost per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4460", "title": "Continuous Batching Micro-Batch Scheduling for TPOT Fairness", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What fairness and TPOT impact does adding 15 new requests to a 90-request continuous decode batch have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4461", "title": "Disaggregated Serving Tail Latency Root Cause Framework", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose P99 TTFT of 800ms when P50 is 120ms and utilization, queues, and bandwidth look normal?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 2}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4462", "title": "Prefill Warmup and JIT Compilation Latency at Service Start", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should the deployment readiness lifecycle prevent traffic routing before JIT compilation and CUDA graph warmup complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4463", "title": "Disaggregated Serving Graceful Degradation Under Partial Failure", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you gracefully degrade service after losing 30% of decode capacity while serving as many users as possible?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 2}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4464", "title": "Embedding Table Sharding Strategy for 10TB Feature Space", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": 
"cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you shard 50 embedding tables totaling 10TB across 128 GPUs, including the 512GB largest table?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4465", "title": "Online Learning Embedding Staleness and Cache Invalidation", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do 15% of requests use stale embeddings after updates, and how would you fix the cache invalidation mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4466", "title": "Multi-Stage Ranking Latency Budget Decomposition", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you allocate a 100ms recommendation latency budget across retrieval, light ranking, and heavy ranking?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 1}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4467", "title": "Feature Freshness vs Serving Latency: Pre-Compute vs Real-Time", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which of the 200 features should be pre-computed versus computed at request time, given 50 real-time and 150 hourly features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4468", "title": "Social Graph Embedding Update Frequency for Friend-Aware Rec", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you incorporate friend activity for 1B users at 10K QPS while keeping P99 latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4470", "title": "Embedding Dimension vs Model Capacity Tradeoff in RecSys", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the memory and performance tradeoffs of increasing item embeddings from 64 to 256 dimensions for 10M items?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4471", "title": "Real-Time Embedding Table Update Consistency Under Training", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", 
"level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What consistency model would you use when training publishes embedding updates every minute but serving may lag?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4472", "title": "ANN Index Rebuild Latency for Embedding Table Updates", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you architect the system to cut the daily 10M-item HNSW index rebuild from 4 hours to under 30 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4473", "title": "Embedding Gradient Sparsity and Optimizer Choice", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is Adam slowing DLRM training on embedding tables, and which optimizer would you use for embeddings versus dense layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4474", "title": "Two-Tower Model vs Cross-Attention Ranker for Serving Cost", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For ranking 1M candidates at 5K QPS, would you serve retrieval with a two-tower model or a cross-encoder, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4475", "title": "All-to-All Communication Optimization for Distributed DLRM", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce embedding all-to-all communication on 128 GPUs from 40% of DLRM training time to under 10%?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 2}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4476", "title": "RecSys Cold Start: New Item Embedding Initialization", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you fix the cold-start embedding initialization to prevent new items from losing 3x impressions during their first 7 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4477", "title": "DLRM Training on TPU v5e Pod: Embedding Table Placement", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", 
"bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you place 8TB of embedding tables on a 256-chip TPU pod with only 4TB total HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4478", "title": "Request Deduplication in High-Frequency Recommendation Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a deduplication layer for 100K QPS when 15% of requests are exact duplicates within 100ms?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 0}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4479", "title": "Exposure Bias and Position Debiasing in Ranking Models", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and correct position bias when positions 1-3 have 5x higher CTR than positions 10-15?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4480", "title": "Sequence Model for Session-Based Recommendation Serving Latency", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is a 15ms Transformer session encoder justified over 2ms session features, and what serving architecture would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4481", "title": "Distributed Training Throughput for 100TB Embedding Cluster", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the distributed training architecture and verify whether 1024 GPUs can process 1T samples in under 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4482", "title": "Serving Diversity vs Relevance Tradeoff in Re-ranking", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a re-ranking algorithm that balances relevance and diversity when 18 of the top 20 items are fashion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4483", "title": "Real-Time Feature Freshness vs Training Distribution Shift", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix the train-serve skew from training on hourly batch features but serving second-level aggregates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4484", "title": "Precision vs Recall Tradeoff in Multi-Stage Retrieval Pipeline", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this acceptable?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 0}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4485", "title": "Full-Stack Personalization Infrastructure: Greenfield Design", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the first 18 months of recommendations to handle 10x yearly growth without over-engineering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4486", "title": "Embedding Table Memory Bandwidth Optimization with Mixed Precision", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you evaluate FP32, FP16, and INT8 row-wise scaling for DLRM embedding tables, and what precision mix would you choose?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 1}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4487", "title": "Hot Row Replication Strategy for Embedding Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you replicate the top 1000 hot rows across 16 GPUs while keeping embedding updates consistent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4488", "title": "Recall Metric Selection for Production Retrieval Evaluation", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do Recall @K, NDCG @K, and Hit Rate @K compare for evaluating the retrieval stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4492", "title": "MI300X MoE Routing Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": 
"design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate an expert routing mechanism that minimizes cross-node traffic over the 400 Gbps bottleneck while maintaining load balance?", "visual": {"kind": "svg", "path": "cloud-4492.svg", "alt": "Diagram comparing local massive HBM bandwidth versus thin network link between MI300X nodes.", "caption": "MoE Token Routing Network"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4495", "title": "Hierarchical Parallelism Placement on NVSwitch", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Map an 8-way Tensor Parallelism (TP) and 8-way Data Parallelism (DP) strategy to the hardware topology to minimize slow inter-node transfers.", "visual": {"kind": "svg", "path": "cloud-4495.svg", "alt": "Topology diagram showing dense intra-node connections and sparse inter-node connections.", "caption": "NVSwitch vs IB Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4498", "title": "Data Pipeline Throughput Matching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the minimum CPU throughput in images/second required to prevent the GPU from stalling on data loading.", "visual": {"kind": "svg", "path": "cloud-4498.svg", "alt": "Bar chart comparing throughput stages: Disk IO, CPU Augmentation, PCIe Transfer, and GPU Compute.", "caption": "Throughput Bottleneck Stages"}, "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 2}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4499", "title": "Torus vs Fat-Tree for AllToAll Workloads", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which topology inherently supports a higher bisection bandwidth for worst-case AllToAll traffic, and why?", "visual": {"kind": "svg", "path": "cloud-4499.svg", "alt": "Diagram comparing a layered tree network with a grid-like torus network.", "caption": "Fat-Tree vs Torus"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4501", "title": "LLM Admission Control for Tail Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a queueing-aware admission control policy that limits tail latency while keeping the GPUs saturated.", "visual": {"kind": "svg", "path": "cloud-4501.svg", "alt": "A graph showing an uncontrolled queue growing exponentially versus an admission-controlled queue flattening out.", "caption": "Controlled vs Uncontrolled Queue"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4504", "title": "PCIe Bottleneck in High-Res Image Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the potential bottleneck in transferring uncompressed FP32 4K image tensors over PCIe Gen5 to feed the H100 GPUs.", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 3}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4505", "title": "Dynamic MIG Autoscaling on A100", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a scaling strategy using Multi-Instance GPU (MIG) to effectively duty-cycle compute during low-traffic periods without node power-downs.", "chain_ids": ["cloud-chain-auto-secondary-017-37"], "chain_positions": {"cloud-chain-auto-secondary-017-37": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4506", "title": "Block-wise FP8 KV Cache Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a block-wise FP8 quantization scheme for the KV cache to maximize sequence length while avoiding attention outlier degradation.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4507", "title": "LLM Checkpoint Bandwidth Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the NFS aggregate write bandwidth required to checkpoint the model and optimizer states within 30 seconds.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4509", "title": "Distributed Vision Dataloader Design", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a data loading pipeline that eliminates GPU starvation and manages network and decoding bottlenecks.", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 5}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4510", "title": "LLM Checkpoint Storage Sizing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify the exact total gigabytes required to save a single full training checkpoint containing all 
necessary parameters and optimizer states.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4511", "title": "Asymmetric INT8 Quantization Kernels", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Derive the basic mathematical formulation to convert FP16 weights to INT8, and explain how they are dequantized during the GEMM.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4512", "title": "API Gateway Queueing Model", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Select the appropriate Kendall notation queueing model for this setup and calculate the overall system utilization.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4513", "title": "GPU Shared Memory Tiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "To maximize tensor core throughput, which specific level of the GPU memory hierarchy must the matrix be tiled into, and what is its strict hardware limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4514", "title": "Diurnal Workload Power Scaling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate a dynamic power-management strategy that minimizes energy waste during off-peak hours?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4515", "title": "Dataloader Thread Blocking", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the mechanical reason for the GPU underutilization and how does changing this parameter fix it?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 1}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4516", "title": "LLM Queueing Wait Time", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct the M/M/1 wait time estimate and demonstrate how reducing service time variance via iteration-level scheduling lowers it.", "visual": {"kind": "svg", "path": 
"cloud-4516.svg", "alt": "Hockey-stick curves comparing M/M/1 and M/D/1 wait times as utilization approaches 1.0.", "caption": "Queue wait times spike non-linearly at high utilization."}, "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 4}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4518", "title": "H100 Data Loading Pipeline Creation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the data pipeline throughput requirements for storage read, CPU decoding, and PCIe Gen5 transfer to prevent GPU starvation.", "visual": {"kind": "svg", "path": "cloud-4518.svg", "alt": "Horizontal bar chart showing throughput jumps from Storage to CPU to PCIe to GPU.", "caption": "H100 Computer Vision Pipeline Bandwidth Stages."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4523", "title": "MI300X Rail-Optimized MoE", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the grouping for Expert Parallelism to minimize inter-node traffic bottlenecks.", "visual": {"kind": "svg", "path": "cloud-4523.svg", "alt": "Topology placement showing 8 nodes connected vertically by rail switches.", "caption": "Rail-optimized topology mapping for 8 nodes."}, "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 3}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4524", "title": "H100 Parquet Prefetch Sizing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Determine the minimum prefetch buffer size per node to hide the S3 network latency entirely.", "visual": {"kind": "svg", "path": "cloud-4524.svg", "alt": "Throughput stages showing Network Latency gap hidden by Prefetch Buffer.", "caption": "Latency hiding via prefetch buffering."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4526", "title": "Llama-3 70B KV Cache Sizing", "topic": "kv-cache-management", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Compute the total memory allocated for the KV cache.", "visual": {"kind": "svg", "path": "cloud-4526.svg", "alt": "Bar chart showing KV cache size exceeding total H100 memory.", "caption": "KV cache memory demand vs single GPU capacity."}, "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 0}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4528", "title": "H100 
JSONL Decompression Bound", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the throughput bottleneck and state the maximum processing rate of the pipeline.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4529", "title": "Multi-Modal Distributed Ingestion Architecture Specification", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Architect the data loader and prefetch specification to ensure the 989 TFLOP/s FP16 tensor cores remain at least 60% utilized without bottlenecking on host-device PCIe bandwidth.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4530", "title": "H100 Burst Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-offs between deep sleep states and idle power overhead when managing these intermittent traffic bursts.", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4533", "title": "MI300X Huge Embedding Placement", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should the 1TB embedding tables be partitioned across the host CPU memory and the MI300X's 192 GB HBM3 to optimize training speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4534", "title": "H100 Distributed Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a distributed checkpointing strategy to minimize the time spent stalling the GPUs while saving model state.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4536", "title": "MoE Tiered Caching", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a tiered caching strategy that minimizes tail latency for expert retrieval while adhering to HBM bandwidth and PCIe Gen5 limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4539", "title": "W8A16 KV Cache Expansion", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": 
"apply", "status": "published", "phase": "both", "question": "Calculate the W8A16 weight footprint, then compute the maximum sustainable batch size given the 32-user, 4096-token KV cache requirement, and determine if W8A16 is sufficient.", "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4540", "title": "H100 Budget Feasibility", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you derive the required duration and prove whether this budget is sufficient given the GPU's theoretical FLOPs and an estimated 40% Model Flops Utilization?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4541", "title": "MI300X DVFS Latency Penalty", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how reducing the GPU clock frequency via DVFS affects both the dynamic power consumption and the latency of individual inference requests.", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4543", "title": "H100 UVM Graph Streaming", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a hardware-aware memory strategy utilizing the memory hierarchy to train this model efficiently without running out of HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4545", "title": "Memory Bandwidth Limits of Large Model Generation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum quantization bit-width required to fit the model, and what is the theoretical minimum token generation latency for a batch size of 1?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 3}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4547", "title": "Training Time Estimation for Large Language Models", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate an equation for the total compute required in FLOPs, and estimate the 
training time assuming 40% Model FLOPs Utilization (MFU).", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4551", "title": "HBM Sharding for MoE Serving", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a memory sharding and offloading strategy to maximize throughput for the skewed traffic while supporting full model routing.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4553", "title": "LoRA Adapter Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the memory overhead of the adapters against the KV cache assuming batch size 32, 2000 sequence length, 32 layers, and 4096 hidden dimension.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4554", "title": "GPU Deep Sleep Energy Savings", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total energy savings in kWh if 80 idle nodes (8 GPUs each) are put into deep sleep for 12 hours?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4557", "title": "Ring AllReduce Bottleneck", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the expected communication time per step using Ring AllReduce on 16 nodes to diagnose the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4558", "title": "H100 Multimodal Pipeline Pre-fetching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design a data loading and staging pipeline to prevent the compute units from starving while handling massive video IO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4561", "title": "Asynchronous Hierarchical Checkpointing for Trillion Parameter Models", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an asynchronous, hierarchical checkpointing system that minimizes blocking time on the GPUs.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution":
"", "napkin_math": ""}}, {"id": "cloud-4570", "title": "MI300X Throughput Stalls and Checkpoint I/O", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether this periodic stall is caused by the global data shuffling pipeline or an implicit checkpointing configuration.", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 3}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4571", "title": "MoE Autoscaling Cluster", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a serving cluster architecture and memory allocation specification that dynamically shifts resources without massive cold-start latencies.", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 2}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4573", "title": "Hardware NVDEC Offloading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the quantitative bandwidth impact of shifting video decoding from CPU to the GPU's hardware NVDEC engines.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4578", "title": "PCIe Bottleneck for Swapping LoRA Adapters", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the latency penalty added to token generation strictly from fetching these adapters over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0001", "title": "The Camera Data Deluge", "topic": "mlops-lifecycle", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before any processing or compression, what is the approximate bandwidth you must provision to move raw pixel data from the sensor to the SoC's memory for this single camera stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~600 KB/s", "~6 MB/s", "~60 MB/s", "~6 GB/s"], "correct_index": 2}}, {"id": "edge-0002", "title": "The Sensor's Front Door Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the most likely component causing the dropped or corrupted frames before they reach main memory?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The LPDDR5 main memory bandwidth is too low to handle six streams.", "The Jetson's GPU is not powerful enough to process the incoming frames.", "The MIPI CSI-2 camera interconnect bandwidth is saturated.", "The NVMe SSD is too slow to store the incoming video frames."], "correct_index": 2}}, {"id": "edge-0003", "title": "The Sensor Bandwidth Limit", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the approximate maximum data rate you can expect to transfer from a standard 4-lane MIPI CSI-2 interface into the SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~205 GB/s", "~50 GB/s", "~2.5 GB/s", "~125 MB/s"], "correct_index": 2}}, {"id": "edge-0004", "title": "The Production Data Glitch", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What data rate does the 1920x1080 8-bit 200 FPS camera produce, and could USB 3.0 be causing the glitches and accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~41.5 MB/s. The data rate is low, so the bottleneck must be elsewhere.", "~3.3 Gbps. This is comfortably within the 5 Gbps theoretical limit of USB 3.0, so the interface is not the issue.", "~415 MB/s. This rate saturates the real-world throughput of a USB 3.0 interface, likely causing dropped frames and data corruption.", "~2.1 MB/s. 
The data rate is trivial; the issue is likely the model's processing speed on the Hailo-8."], "correct_index": 2}}, {"id": "edge-0005", "title": "The Automotive I/O Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which component typically offers higher data bandwidth: the AGX Orin's LPDDR5 memory system or the MIPI CSI-2 camera interface that feeds it?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 0}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly equal, as they are designed to be balanced.", "The LPDDR5 memory system, by about 80x.", "The MIPI CSI-2 camera interface, by about 10x.", "The LPDDR5 memory system, but only by a small amount (~2-3x)."], "correct_index": 1}}, {"id": "edge-0009", "title": "The Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is your colleague dangerously wrong regarding the 30 FPS frame budget?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 2}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0013", "title": "The Jittery Robot Arm", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the common system-level culprits for this jitter, and how would you diagnose and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0015", "title": "The AV Pipeline Stall", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely cause of the 5.5ms overhead you are observing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's operators are not fully supported by the Jetson AGX Orin's hardware, causing frequent CPU fallbacks.", "The 100 Gbps interconnect is the bottleneck; transferring the 64 MB feature map is taking ~5.1ms.", "Real-time OS scheduling jitter and CUDA kernel launch overhead are consuming the extra 5.5ms.", "The memory bandwidth on one of the Orin modules is saturated, slowing down its 12.5ms computation."], "correct_index": 1}}, {"id": "edge-0016", "title": "The AV Perception Pipeline Stall", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Given this data, how would you diagnose and solve this pipeline stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": 
["The model is too slow. It must be quantized from FP16 to INT8 to reduce its runtime below 16ms per frame.", "The Jetson AGX Orin is memory bandwidth-bound. The 6 video streams are saturating the 204.8 GB/s LPDDR5 bus.", "The system is executing the 6 camera streams serially. The tasks should be parallelized using CUDA streams to run concurrently.", "The Jetson AGX Orin lacks sufficient compute. 275 TOPS isn't enough for 6 cameras and must be upgraded."], "correct_index": 2}}, {"id": "edge-0017", "title": "The Pipeline Overlap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Without changing any model or buying new hardware, how do you cut the per-frame latency?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 3}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0018", "title": "The Thermal Throttling Deadline Miss", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the latency double?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 2}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0020", "title": "The TensorRT vs ONNX Runtime", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should you switch to ONNX Runtime for faster deployment, or is there a better approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0021", "title": "The Rainy Day mAP Cliff", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does moderate rain cause such a catastrophic accuracy drop, and why is it a cliff rather than a slope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0023", "title": "The Camera-to-Inference Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's going wrong, and how do you fix it without buying faster hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0024", "title": "The WCET Analysis Wall", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is measurement-based WCET insufficient for safety certification, and what does a valid WCET analysis require for a neural network?", 
"chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 4}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0025", "title": "The RTOS vs RT-Linux Tradeoff", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Who is right, and how do you design a system to resolve this conflict?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 2}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0026", "title": "The Hard Real-Time Challenge", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the software stack on an edge SoC to ensure deterministic performance, especially when running complex ML models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0028", "title": "The ROS 2 IPC Overhead", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is ROS 2 doing that costs 35ms, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0029", "title": "The Python GIL Multithreading Trap", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did Python multithreading fail to improve the framerate on the 4-core Raspberry Pi?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0030", "title": "The Adaptive Quality Ladder", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What system design maintains 30 FPS across varying scene complexities without changing hardware?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0031", "title": "The Solar Panel Degradation Budget Squeeze", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does a 24% panel degradation cause a 5-hour blackout, and how do you adapt the ML workload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0033", "title": "Edge-Cloud Hybrid Inference Break-Even", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "optimization", 
"bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the break-even point — when does on-device become cheaper than cloud?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0034", "title": "The Multi-Tenant Edge Scheduler", "topic": "safety-certification", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a multi-tenant scheduler so loss prevention meets its deadlines during Black Friday?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0035", "title": "The Preemption Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why didn't the High-Priority ML Thread just preempt the Low-Priority thread?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0036", "title": "The WCET Analysis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you construct the worst-case execution time (WCET) argument?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 5}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0037", "title": "The Sensor Aging Silent Accuracy Rot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you detect and correct fleet-wide gradual degradation that's invisible at the individual device level?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0041", "title": "The Post-Replacement Camera Miscalibration", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What did the technician miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0043", "title": "The Dropped Frame Dilemma", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a robust, real-time sensor data pipeline that handles variable ML inference times and prevents data loss or desynchronization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0048", "title": "The YUV Conversion Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the NPU only running at 40% utilization while you are dropping frames?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 2}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0051", "title": "Sensor Fusion Latency Budget", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can you meet the 50ms deadline, and what is the maximum sensor-to-output latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0053", "title": "The Asynchronous Orchestra", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you synchronize these disparate sensor inputs for real-time ML perception without introducing excessive latency or data staleness, especially when ML inference itself takes time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0054", "title": "The Camera VSync Tearing", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is your latency violently oscillating between 12ms and 28ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0056", "title": "The Radar-Camera Fusion Latency", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the extra 25ms hiding, and when does the fusion latency overhead negate the benefit of having radar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0060", "title": "The Perpetual Calibration Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you design a system that maintains high-precision sensor fusion and localization accuracy over half a decade without human intervention for recalibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0061", "title": "Diagnosis: Root Cause of Thermal Throttling in Sealed Edge Enclosures", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"inference", "question": "What caused the gradual performance degradation, and how can it be fixed?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 1}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0063", "title": "The Battery Saver", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you maximize battery life while ensuring reliable detection and responsiveness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0064", "title": "Battery Life for Solar-Powered Edge Device", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Estimate the daily energy budget and determine if the system can run indefinitely on the 10W solar panel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0065", "title": "The Power State Machine", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the power state machine to meet the <200ms wake target and 60-day battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0066", "title": "The Thermal Zone Juggle", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the sudden latency increase for the GPU model despite being under the system TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0067", "title": "The Silent Slowdown", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely cause of this 'silent slowdown,' and how would you design the system to guarantee sustained performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0068", "title": "The Overheating Robot Dog", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the robot dog's system to maintain acceptable navigation performance despite long-duration thermal constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0069", "title": "The Duty Cycling Power Budget", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": 
"analyze", "status": "published", "phase": "inference", "question": "How do you run a 45W workload on a 30W thermal budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0071", "title": "The Overheating Vision Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely culprit for this performance drop, and how does it impact real-time design?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0072", "title": "The DVFS Latency Jitter", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's causing the latency spikes, and how do you guarantee the 25ms deadline?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 3}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0073", "title": "The Power-Over-Ethernet Budget", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much power budget remains for the AI accelerator under standard PoE, and which edge chips actually fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0075", "title": "Thermal Headroom in a Sealed Enclosure", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal headroom do you have in summer and winter, and what thermal-aware inference policy would you design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0076", "title": "The Thermal Throttling Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the system to maintain a minimum acceptable performance level under varying thermal and power constraints without completely failing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0077", "title": "The Thermal Staircase", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does performance degrade in steps rather than gradually, and how do you design around it?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-0078", "title": "The Thermal Derating Curve", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How is thermal derating causing the 30 FPS to 21 FPS drop, and what thermal resistance is required to safely sustain 25W at 45°C ambient?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 3}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0081", "title": "The Throttled Vision System", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most likely cause of the sudden drop in NPU utilization and FPS during continuous operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0086", "title": "Thermal-Aware Inference Scheduler", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule 5 perception models to prevent thermal throttling while keeping safety-critical latency under 33ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0088", "title": "The Perpetual Sensor", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architecture and power management strategies would let a motion-triggered wildlife camera run for 5 years on a small battery pack?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0089", "title": "The Energy-Aware Reconnaissance Drone", "topic": "compound-ai-systems", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design adaptive inference for a 6-hour drone mission when ML power drops to 20-30% during maneuvers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0090", "title": "The Solar-Powered Edge Budget", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What power budget and maximum inference rate can the 20W solar, 100Wh bird-classifier station sustain with 5 hours of sunlight per day?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0092", "title": "The TensorRT Engine Portability Trap", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong with the TensorRT engine migration?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0093", "title": "The Model Pruning Speedup Myth", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 90% sparsity give no speedup on the Jetson Orin NX, and what kind of pruning would actually help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0094", "title": "The Optimization Ladder", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is designing a custom architecture the wrong first step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0095", "title": "The Pruning Paradox", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the same pruning give wildly different speedups on different hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0096", "title": "The Safety Watchdog Timer", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What caused the safety watchdog timer to fire despite the 35ms average inference time?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 2}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0097", "title": "The RTOS Interconnect Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which IPC mechanisms would you use on the RTOS to meet real-time deadlines, move large buffers efficiently, and isolate process failures?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 0}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0098", "title": "The Edge-Cloud Hybrid Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What 4G offloading policy should handle the 5% ambiguous crop images, and does the hybrid approach actually help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0099", "title": "The Deterministic Inference Mirage", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": 
"mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the 2.7ms timing variation on identical inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0100", "title": "The Update Blind Spot", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary security flaw in an update design that deletes the old model before downloading and loading the new one, taking 45 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0101", "title": "The Memory Copy Ceiling", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the memory flow diagram, where are the 'missing' milliseconds being spent, and how do you hit the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0102", "title": "The Model Cloning Waste", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the VRAM allocation diagram, why is your system using 16% more memory than it needs to?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 3}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0104", "title": "The Sealed Oven Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the thermal diagram, what is the physical flaw in your cooling strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0105", "title": "The Rolling Shutter Tear", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Based on the exposure diagram, what physical phenomenon is destroying your accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0106", "title": "The Memory Pressure Leak", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the producer-consumer diagram, why is your memory usage increasing over time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0107", "title": "The Memory Copy Choke", "topic": "real-time-deadlines", "competency_area": 
"data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the data path diagram, what is the 'silent' task consuming all your CPU cycles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0109", "title": "The Sequential Serializer", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the utilization diagram, how would you collapse this timeline to fit the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0110", "title": "The Bus Priority Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the SoC architecture diagram, what physical component is causing the NPU to stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0111", "title": "The Thermal Throttle", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Between Chip A (80 TOPS peak, 40W TDP) and Chip B (25 TOPS peak, 8W TDP), which will deliver higher sustained performance within the drone's 10W thermal envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Chip A, providing 80 TOPS.", "Chip B, providing 100 TOPS.", "Chip A, providing 20 TOPS.", "Chip B, providing 25 TOPS."], "correct_index": 3}}, {"id": "edge-0112", "title": "The Case of the Missing Gigabytes", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Besides the model itself and the OS, what other major consumer of DRAM on an edge device accounts for the multi-gigabyte discrepancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Page file / swap space on the eMMC storage.", "Memory fragmentation from other running processes.", "The OS kernel, which typically uses a few hundred megabytes.", "A large, pre-allocated DMA buffer for the camera sensor stream."], "correct_index": 3}}, {"id": "edge-0114", "title": "The ViT Memory Wall on the Edge", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the ViT, what is the most likely primary performance bottleneck that could cause it to miss the 33ms deadline?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 0}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 
total number of FLOPs in the MLP blocks", "The flash storage required for the model's parameters", "Memory bandwidth saturation from the quadratic complexity of self-attention", "The latency of the initial patch embedding (stem) layer"], "correct_index": 2}}, {"id": "edge-0116", "title": "The Throttled Robot", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate sustained INT8 TOPS you can expect to achieve in this mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "~140 TOPS", "~70 TOPS", "15 TOPS"], "correct_index": 2}}, {"id": "edge-0117", "title": "Pruning for Parallelism", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To achieve a real-world speedup with the TensorRT runtime, which fundamental type of pruning should you recall as the most effective starting point?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0118", "title": "The 'Open-Case' Vulnerability", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which physical interface on the device's circuit board represents the most direct and highest-bandwidth point of attack for this data injection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The I2C bus used for sensor configuration.", "The encrypted LPDDR5 DRAM where the model is stored.", "The MIPI CSI-2 camera interface.", "The UART serial console used for debugging."], "correct_index": 2}}, {"id": "edge-0121", "title": "The INT8 Memory Footprint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 25M parameters, how much memory do the weights require in FP32 versus INT8, and how much is saved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 MB (FP32) and 25 MB (INT8), a 2x reduction.", "200 MB (FP32) and 25 MB (INT8), an 8x reduction.", "100 MB (FP32) and 25 MB (INT8), a 4x reduction.", "100 MB (FP32) and 12.5 MB (INT8), an 8x reduction."], "correct_index": 2}}, {"id": "edge-0122", "title": "The Transformer's Quadratic Curse", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does computational cost scale when increasing from 320x320 to 640x640 for a CNN versus a Vision Transformer (ViT)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both architectures scale 
linearly, leading to a ~4x increase in FLOPs.", "The CNN's cost scales ~4x, while the ViT's scales ~16x.", "The CNN's cost scales quadratically (~16x), while the ViT's is linear (~4x).", "The cost increase is negligible (~2x) for both due to hardware acceleration."], "correct_index": 1}}, {"id": "edge-0124", "title": "The Thermal Throttling Trap: Power Budgeting", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What sustained INT8 performance should you design around under the robot's 30W thermal limit, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["138 TOPS, because performance scales linearly with the power budget.", "275 TOPS, the device will just run hotter but should maintain its advertised performance.", "Around 170 TOPS, as the device selects its more efficient 30W DVFS power profile.", "26 TOPS, which is the peak performance of other efficient edge accelerators in that power range."], "correct_index": 2}}, {"id": "edge-0125", "title": "The Structured Sparsity Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What theoretical speedup should 2:4 structured pruning deliver for this compute-bound model in TensorRT?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No speedup, it only saves memory.", "1.5x speedup, due to framework overheads.", "2x speedup.", "4x speedup."], "correct_index": 2}}, {"id": "edge-0126", "title": "The Compromised Robot", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "As the ML systems engineer, what is the most fundamental and effective first line of defense you should explain to your team to prevent the robot from ever running this maliciously modified model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Encrypt the model file stored on the robot's flash memory.", "Retrain the vision model using adversarial training to make it robust to sticker attacks.", "Enable Secure Boot to create a hardware-rooted chain of trust.", "Configure a firewall on the robot to block all incoming network traffic."], "correct_index": 2}}, {"id": "edge-0127", "title": "The Power Efficiency Fallacy", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Between a Jetson AGX Orin and a Hailo-8, which device is fundamentally more power-efficient in terms of TOPS/W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Jetson AGX Orin, because its 275 TOPS is over 10x higher than the Hailo-8's 26 TOPS.", 
"The Hailo-8, because it delivers over 2x the TOPS/W compared to the Jetson.", "They are roughly equivalent in efficiency; the choice depends on other factors.", "It's impossible to tell without knowing the specific model architecture."], "correct_index": 1}}, {"id": "edge-0128", "title": "The Edge Roofline Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on their hardware specs, which device, a Jetson AGX Orin or a Hailo-8, is more likely to be bottlenecked by memory bandwidth rather than its compute units?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Hailo-8, because its compute (26 TOPS) is lower, making it the bottleneck. (Calculated Trap: ignores Ridge Point)", "The Jetson AGX Orin, because its Ridge Point is much higher than classic CNN Arithmetic Intensity.", "Neither, as both use modern memory systems that eliminate bottlenecks. (Calculated Trap: ignores Roofline Model entirely)", "The device with more RAM, as it will try to process more data at once. (Calculated Trap: confuses capacity with bandwidth)"], "correct_index": 1}}, {"id": "edge-0129", "title": "The Unified Memory Contention", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What fundamental hardware constraint is causing the unstable latency and system lag?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU's dedicated HBM memory is too small for the model.", "The CPU and GPU are competing for access to the same shared LPDDR5 DRAM.", "The PCIe bus connecting the CPU and GPU is saturated.", "The Linux OS is swapping model memory to the NVMe drive."], "correct_index": 1}}, {"id": "edge-0130", "title": "The CPU-Free Camera Ingest", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hardware feature lets the camera write frames into system DRAM for the GPU without CPU data copies?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 0}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A high-priority CPU thread performs a `memcpy` from the camera to DRAM.", "The GPU reads pixels directly from the camera sensor's private SRAM.", "Direct Memory Access (DMA) controllers manage the transfer independent of the CPU.", "The camera data is sent to the GPU through a series of L1 cache line fills."], "correct_index": 2}}, {"id": "edge-0131", "title": "The Edge Robot's Memory Tax", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "After the 6 GB system reservation, will the 5B-parameter FP16 model plus 4 GB of activations fit in DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["No, the model's 5B parameters require 20 GB in FP16, which is too large for the 26 GB available after activations.", "No, it won't fit because the 14 GB model requirement exceeds the total system DRAM.", "Yes, it requires 14 GB (10 GB for FP16 weights + 4 GB for activations), which is less than the 26 GB of available DRAM.", "Yes, it fits because the model's 10 GB of weights is less than the total 32 GB of DRAM."], "correct_index": 2}}, {"id": "edge-0132", "title": "The High-Speed Camera's DMA Budget", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the DMA transfer time for one 4K RGB frame over the 4-lane MIPI CSI-2 bus, and does it fit the 60 FPS frame budget?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 1}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the transfer takes ~80 ms, which exceeds the 16.7 ms budget.", "Yes, the transfer takes ~10 ms, which is well within the 16.7 ms budget.", "No, the transfer time is only ~0.12 ms because it uses the Jetson's 204.8 GB/s DRAM bandwidth, but the processing will be the bottleneck.", "Yes, the transfer is nearly instantaneous because DMA operations don't consume bus bandwidth."], "correct_index": 1}}, {"id": "edge-0133", "title": "The 30 FPS Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum latency budget for the entire ML inference pipeline for a single frame to meet the 30 FPS hard real-time requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16.6 ms", "30 ms", "33.3 ms", "100 ms"], "correct_index": 2}}, {"id": "edge-0134", "title": "Worst-Case vs. 
Average-Case", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Does this system satisfy the real-time requirement?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 0}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the average latency (28ms) is well below the 33ms deadline.", "No, because the worst-case latency (45ms) exceeds the 33ms deadline.", "Yes, because the Jetson AGX Orin has over 200 TOPS, which is sufficient.", "It's impossible to say without knowing the P99.9 latency."], "correct_index": 1}}, {"id": "edge-0135", "title": "The 30 FPS Frame Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum frame budget in milliseconds for the entire vision pipeline to meet a 30 FPS requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16.7 ms", "30 ms", "33.3 ms", "0.033 ms"], "correct_index": 2}}, {"id": "edge-0136", "title": "Throughput vs. Latency", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 26 TOPS Hailo-8 run a 2 TOPs-per-frame model at 30 FPS, and what maximum FPS can it achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, 26 TOPS is much greater than the 2 TOPS required.", "No, it can only achieve about 7.7 FPS.", "No, it can only achieve 13 FPS.", "Yes, it can run at 520 FPS."], "correct_index": 2}}, {"id": "edge-0137", "title": "The Thermal Handcuffs", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the *maximum sustained performance* you can realistically expect from this chip under that thermal constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4.6 TOPS", "275 TOPS", "69 TOPS", "0.3 TOPS"], "correct_index": 2}}, {"id": "edge-0138", "title": "Crossing the Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Is this model compute-bound or memory-bound on the Orin given 2,000 Ops/Byte versus a 1,342 Ops/Byte ridge point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is Memory-Bound.", "The model is Compute-Bound.", "The model is I/O-Bound.", "The model is Thermal-Bound."], "correct_index": 1}}, {"id": "edge-0139", "title": "The Orin Utilization Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": 
"recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this 50 GOps, 100 MB model on the Jetson AGX Orin limited by compute or memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bound because its Arithmetic Intensity (~500 Ops/Byte) is less than the Orin's Ridge Point (~1342 Ops/Byte).", "The model is compute-bound because the Jetson Orin has a very high peak compute of 275 TOPS.", "The model is compute-bound because its Arithmetic Intensity is high (50 Giga-ops is a large number).", "The model is memory-bound because its power consumption (TOPS/W) would be too high otherwise."], "correct_index": 0}}, {"id": "edge-0140", "title": "The Edge SRAM Speed Advantage", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much faster is accessing this on-chip SRAM compared to accessing the main system LPDDR5 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x faster.", "~300x faster.", "~25x faster.", "They are about the same speed."], "correct_index": 2}}, {"id": "edge-0141", "title": "Defining the Tensor Arena", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary purpose of this memory region?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To store the model's immutable weight parameters read from flash.", "To dynamically allocate memory for activations using `malloc` as needed.", "To provide a static memory block for all activation tensors, avoiding `malloc`.", "To serve as a high-speed cache for the main system DRAM."], "correct_index": 2}}, {"id": "edge-0142", "title": "The SRAM Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the peak tensor arena memory usage with buffer reuse, and will it fit in 256 KB of SRAM?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["288 KB, because you sum all the tensors (96+128+64). It will not fit.", "128 KB, because that is the size of the largest tensor. It will fit.", "224 KB, because the peak is the sum of the input and output of the first layer (96+128). It will fit.", "112 KB, because you calculated with INT8 instead of FP16. 
It will fit."], "correct_index": 2}}, {"id": "edge-0143", "title": "The DMA Offload Dividend", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long does a 300 KB transfer take using a 10 GB/s CPU memcpy versus a 100 GB/s DMA transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU takes ~1.5 µs, the DMA takes ~0.3 µs. Both are very fast.", "The CPU takes 30 µs, the DMA takes 3 µs.", "Both take roughly the same time, ~1.5 µs, as they are limited by the device's 204.8 GB/s peak memory bandwidth.", "The CPU takes ~240 µs, the DMA takes ~24 µs, because you confused GB/s with Gb/s."], "correct_index": 1}}, {"id": "edge-0145", "title": "The Stereo Vision Memory Squeeze", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Will quantizing two 50 MB FP16 models to INT8 fit within the 60 MB SRAM weight budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the total size is still 100 MB, which is over the 60 MB budget.", "Yes, the total size will be 25 MB, leaving plenty of extra space.", "Yes, the total size will be 50 MB, which fits within the 60 MB budget.", "No, the quantization only provides a 1.5x reduction, resulting in a total size of ~67 MB, which is still too large."], "correct_index": 2}}, {"id": "edge-0148", "title": "The Perception of Speed on Edge Devices", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following metrics is the most crucial to minimize to address the user's perception of slowness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time to First Token (TTFT)", "Time per Output Token (TPOT)", "Peak Memory Usage", "Model FLOPS"], "correct_index": 0}}, {"id": "edge-0149", "title": "The Real-Time Deadline Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is batching two frames feasible when single-frame inference is 20 ms and the real-time deadline is 33 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the amortized time per frame is still 20ms, which is under the deadline.", "No, the total processing time for the batch (40ms) exceeds the 33ms real-time deadline.", "Yes, because batching increases compute efficiency and overall FPS.", "Yes, because the time per frame in the batch becomes 10ms (20ms / 2), which is faster."], "correct_index": 1}}, {"id": "edge-0150", "title": "The Continuous Batching Queue", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "When will Frame A's result be 
ready, and what is its total latency from arrival at T=10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30ms, the time it takes for the model to run inference.", "40ms, the time from arrival until the batch is dispatched.", "70ms, the clock time when the inference finishes, yielding a 60ms total latency.", "10ms, because the frame is processed immediately."], "correct_index": 2}}, {"id": "edge-0151", "title": "The Edge Device Power Average", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is its average power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.375 W", "500 mW", "700 mW", "2.5 W"], "correct_index": 2}}, {"id": "edge-0152", "title": "The Passive Cooling Limit", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "To prevent the system from overheating, what is the maximum sustainable duty cycle (the percentage of time the accelerator can be active)?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100%", "60%", "40%", "2.5%"], "correct_index": 2}}, {"id": "edge-0153", "title": "The Kernel Launch Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary overhead that this operator fusion reduces?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 0}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total floating-point operations (FLOPs) of the model.", "The model's memory footprint on disk.", "Kernel launch overhead and DRAM traffic between layers.", "The peak power draw of the accelerator."], "correct_index": 2}}, {"id": "edge-0154", "title": "The Fusion Overhead Fallacy", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If you fuse these three operations into a single kernel, what is the approximate latency speedup assuming a 5µs kernel launch overhead?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 1}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~14.3% speedup", "~28.6% speedup", "~50.0% speedup", "10µs"], "correct_index": 1}}, {"id": "edge-0156", "title": "The Watchdog Timer and the Checkpoint Tax", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", 
"bloom_level": "understand", "status": "published", "phase": "training", "question": "How does the 1ms checkpoint affect the 50ms watchdog budget, and what maximum inference latency is allowed when checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["43ms", "49ms", "42ms", "42.9ms"], "correct_index": 2}}, {"id": "edge-0157", "title": "The Dusty Lens Problem", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much of the 33ms budget is spent just reading one 1920x1080 8-bit grayscale frame from DRAM for contrast normalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~830 µs", "~1.2 ms", "~10 µs", "~81 µs"], "correct_index": 2}}, {"id": "edge-0158", "title": "The Communication Tax of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From an operational cost perspective, what factor do you need to identify as the primary driver of your budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The compute cost of aggregating the one million models on the central server.", "The power drawn by the one million edge devices to train the model locally.", "The network bandwidth cost to transfer all model updates to the cloud.", "The storage cost for the historical archive of all global models."], "correct_index": 2}}, {"id": "edge-0159", "title": "The Federated vs. Centralized Upload Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 1,000 cameras, how do daily upload volumes compare between centralized image uploads and federated MobileNetV3 weight updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: ~100 MB, Federated: ~8 MB. Centralized is more expensive but only by about 12.5x.", "Centralized: ~100 GB, Federated: ~16 GB. Centralized is ~6.25x more expensive.", "Centralized: ~100 GB, Federated: ~8 GB. Centralized is ~12.5x more expensive.", "Centralized: ~100 GB, Federated: ~4 GB. 
Centralized is ~25x more expensive."], "correct_index": 2}}, {"id": "edge-0160", "title": "The Edge Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate TOPS-per-Watt efficiency of a Hailo-8 accelerator at its nominal power envelope?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1 TOPS/W", "~4.5 TOPS/W", "~10 TOPS/W", "~26 TOPS/W"], "correct_index": 2}}, {"id": "edge-0161", "title": "The Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a CNN at 500 Ops/Byte on hardware with a 1,342 Ops/Byte ridge point, is the bottleneck memory or compute?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 0}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute (TOPS)", "Memory Bandwidth (GB/s)", "Power Consumption (Watts)", "On-chip Interconnect"], "correct_index": 1}}, {"id": "edge-0162", "title": "The Edge Ridge Point", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the Orin's INT8 ridge point from 275 TOPS and 204.8 GB/s, and how does it separate memory-bound from compute-bound models?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 1}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.34 Ops/Byte (Dividing 275 / 204.8 directly without unit conversion).", "~0.0007 Bytes/Op (Inverting the formula: Bandwidth / Compute).", "~1343 Ops/Byte. A model's AI must be higher than this to be compute-bound.", "~1343 Ops/Byte. A model's AI must be lower than this to be compute-bound."], "correct_index": 2}}, {"id": "edge-0163", "title": "The On-Chip vs. 
Off-Chip Memory Chasm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is an LPDDR5 DRAM read than an on-chip L2 cache hit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2-3x slower", "~25x slower", "~100x slower", "The speed is the same, just the capacity is different"], "correct_index": 1}}, {"id": "edge-0164", "title": "The Edge LLM's Memory Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If the device needs to support a context window of 4096 tokens in FP16 precision, how much of the Jetson's VRAM will be consumed solely by the KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~402 MB", "~4 GB", "~805 MB", "~1.6 GB"], "correct_index": 2}}, {"id": "edge-0165", "title": "The TinyML Tensor Arena Calculation", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum required size for the tensor arena to execute this specific operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40 KB", "95 KB", "65 KB", "256 KB"], "correct_index": 2}}, {"id": "edge-0167", "title": "The Quantization Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What percentage reduction in weight memory should you expect when quantizing 350M parameters from FP16 to INT8?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A 75% reduction.", "A 100% reduction.", "A 50% reduction.", "A 25% reduction."], "correct_index": 2}}, {"id": "edge-0168", "title": "The Edge Transformer Parameter Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which first-layer design is more memory-efficient for the 64x64 patch model: self-attention or a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Transformer is cheaper; its parameter cost is 128*128 which is less than the 3*3*128*128 of a standard CNN.", "The CNN is cheaper; it has ~17.5k parameters while the Transformer has ~65.5k, a ~3.7x difference.", "They are roughly equal; the Transformer 4*128*128 cost is similar to a standard convolution 3*3*128*128 cost.", "The Transformer is cheaper; the attention calculation softmax(Q*K^T) has no parameters, making it more efficient."], "correct_index": 1}}, {"id": "edge-0169", "title": "The Real-Time Batching Tax", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", 
"bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental trade-off you must immediately identify for a real-time system like this?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 0}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It strictly improves throughput by amortizing computation, with no significant impact on individual frame latency.", "It primarily reduces the power consumption of the edge device by increasing utilization.", "It increases system throughput but also increases the processing latency for every single frame.", "It mainly increases the required on-chip memory, which is the key constraint."], "correct_index": 2}}, {"id": "edge-0171", "title": "The Continuous Batching Sweet Spot", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What batch size maximizes throughput while keeping worst-case frame latency within 150ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8", "23", "11", "As large as memory allows"], "correct_index": 0}}, {"id": "edge-0172", "title": "The Duty Cycle Power Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the average power consumption over this 10-second active/sleep cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.0 mW (Calculated Trap: Ignoring sleep power)", "~5.0 mW (Calculated Trap: Unweighted average of power states)", "~1.01 mW", "~10.0 mW (Calculated Trap: Ignoring the duty cycle entirely)"], "correct_index": 2}}, {"id": "edge-0173", "title": "The Edge Compute Ceiling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before analyzing the model architecture, what is the advertised peak INT8 compute performance of a single Jetson AGX Orin device?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 TOPS", "989 TOPS", "275 TOPS", "100 TOPS"], "correct_index": 2}}, {"id": "edge-0174", "title": "The Watchdog Timer's Deadline", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What does the 500ms watchdog do, and what maximum inference latency remains after reserving 150ms for clean shutdown?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 0}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["500ms. The entire watchdog window is the latency budget.", "650ms. The shutdown time is added to the watchdog timeout.", "350ms. The safe shutdown time must be subtracted from the total budget.", "150ms. The inference budget is equal to the safe shutdown time."], "correct_index": 2}}, {"id": "edge-0175", "title": "The Data Gravity of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest operational cost avoided by choosing Federated Learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cost of on-device compute for local training.", "The cost of the central server to aggregate model updates.", "The cost of network data transfer to and from the devices.", "The initial hardware cost (CapEx) of the devices."], "correct_index": 2}}, {"id": "edge-0177", "title": "The Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing two different chips, what is the single most important efficiency metric to consider?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The chip's maximum theoretical compute performance (peak TOPS).", "The memory bandwidth available per watt.", "Power efficiency: the operations delivered per watt of power consumed.", "The financial cost per TOP of performance ($/TOP)."], "correct_index": 2}}, {"id": "edge-0178", "title": "The Bottleneck Identity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Is this element-wise operation typically compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Neither, it's bound by the PCIe bus.", "Compute-bound", "Memory-bound", "It is always compute-bound on a GPU."], "correct_index": 2}}, {"id": "edge-0179", "title": "The Perception Model's Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 50 GOPS per 50 MB inference workload memory-bound or compute-bound on the AGX Orin, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The AI is 1000 Ops/Byte, making the model memory-bound.", "The AI is 1000 Ops/Byte, making the model compute-bound.", "The AI is 1342 Ops/Byte, making the model compute-bound.", "The AI is 1 Op/Byte, making the model memory-bound."], "correct_index": 0}}, {"id": "edge-0180", "title": "The SRAM Tensor Arena", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary purpose of the Tensor 
Arena memory region?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To permanently store the model's weights in fast memory.", "To serve as a dedicated buffer for Direct Memory Access (DMA) transfers from peripherals.", "To provide a statically-allocated region for tensor activations, avoiding dynamic memory calls.", "To act as a software-managed cache for data stored in slower, external DRAM."], "correct_index": 2}}, {"id": "edge-0181", "title": "The ADAS VRAM Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM do the 44M FP16 parameters require, and what total is needed after adding 14 MB of activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~88 MB. The activation memory is temporary and doesn't count towards the total.", "~704 MB. You need to use the training memory rule of thumb (16 bytes/param).", "~102 MB. The total is the sum of parameter and peak activation memory.", "~58 MB. It's 44MB for weights (INT8) plus 14MB for activations."], "correct_index": 2}}, {"id": "edge-0182", "title": "The Real-Time Sensor Fusion Dilemma", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Where should the 2 MB INT8 weights be stored given 1 MB SRAM, and what hardware should move camera data without CPU bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Store weights in DRAM and have the CPU perform a `memcpy` to move camera data.", "Store weights in DRAM; use a DMA controller to move data from camera to DRAM and from DRAM to SRAM for processing.", "The model won't fit because the 2 MB of weights are larger than the 1 MB of SRAM.", "Store weights in DRAM and have the compute engine access them directly from DRAM for every calculation."], "correct_index": 1}}, {"id": "edge-0183", "title": "The Quantization Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate energy saving for a single compute operation when switching from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2× more efficient", "~4× more efficient", "~18× more efficient", "~100× more efficient"], "correct_index": 2}}, {"id": "edge-0186", "title": "The Perception Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the vehicle can react safely to its environment, what is the maximum permissible end-to-end latency for processing a single frame, often referred to as the 'frame budget'?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 0}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "16 ms", "33 ms", "1 ms"], "correct_index": 2}}, {"id": "edge-0187", "title": "The Real-Time Deadline Violation", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If batch size doubles to 2 and inference scales linearly from 20ms, can the system still meet the 33ms frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system becomes more efficient because average throughput increases.", "The system violates its deadline because the batch processing time exceeds the per-frame budget.", "The system is fine because the average processing time per frame is still 20ms.", "The deadline is missed, but it is okay because the GPU is more utilized."], "correct_index": 1}}, {"id": "edge-0188", "title": "The Continuous Batching Throughput Advantage", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which gives higher TPOT for variable arrivals: static batching of 8 with a 10ms timeout or continuous batching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Static batching is better because a larger batch size of 8 is more hardware-efficient.", "Continuous batching is better because it minimizes GPU idle time by processing requests from a queue as soon as capacity is available.", "The throughput will be identical because the underlying hardware performance is the same.", "Static batching with a very short timeout (e.g., 1ms) would be better than continuous batching."], "correct_index": 1}}, {"id": "edge-0190", "title": "The Corrupted Sensor Stream", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How can switching from a 4-lane to 2-lane MIPI CSI-2 interface hurt accuracy, and how do their data rates compare if 4 lanes provide 2.5 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 2-lane interface has half the bandwidth (1.25 GB/s), but this only increases latency and doesn't change the image data itself.", "The camera's data rate is only 2.5 GB/s, so it won't be a problem for either interface.", "The 2-lane interface has half the bandwidth (1.25 GB/s), which may force the system to apply lossy compression or drop frames, causing a training-serving skew.", "The MIPI interface bandwidth is irrelevant; the bottleneck would be the Jetson AGX Orin's memory bandwidth (204.8 GB/s), which is much higher."], "correct_index": 2}}, {"id": "edge-0191", "title": "The Energy Cost of Data Movement", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much more energy is consumed by a single off-chip DRAM access compared to a single FP16 
computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10x", "Compute is ~10x more expensive", "~580x", "~50,000x"], "correct_index": 2}}, {"id": "edge-0192", "title": "The Power Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing two chips, like a Jetson AGX Orin and a Hailo-8, what is the single most important metric for evaluating power efficiency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "TOPS/W", "Memory Bandwidth (GB/s)", "TDP (Thermal Design Power)"], "correct_index": 1}}, {"id": "edge-0193", "title": "The Roofline Litmus Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Given an arithmetic intensity of 600 Ops/Byte and an INT8 ridge point of 1,342 Ops/Byte, what is the primary performance bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound", "Memory-bound", "Network-bound", "Power-bound"], "correct_index": 1}}, {"id": "edge-0195", "title": "The Edge VLM's KV-Cache Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM is required for the FP16 KV-cache with 24 layers, 16 heads, head dimension 128, and 4096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["384 MiB", "1536 MiB", "768 MiB", "805 MiB"], "correct_index": 2}}, {"id": "edge-0196", "title": "The Radar's SRAM Ingestion Time", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Interpret the bus specification and calculate the minimum time required to DMA a 256 KB frame over a 2.5 GB/s MIPI bus.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~102 ms (Unit Error: Confusing GB/s with MB/s)", "~1 µs (Concept Error: Quoting bus latency instead of calculating transfer time)", "~105 µs", "~819 µs (Unit Error: Confusing Gigabytes (GB) with Gigabits (Gb))"], "correct_index": 2}}, {"id": "edge-0197", "title": "The Energy Cost of Precision: Extreme Quantization", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much energy does a compute operation save when switching from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around 4×, because it uses 4 times fewer bits.", "Around 3-4×, the savings for using half-precision (FP16).", "Around 18×, due to the complexity of floating-point vs. 
integer logic.", "Over 100×, similar to the latency gap between cache and DRAM."], "correct_index": 2}}, {"id": "edge-0200", "title": "The 30 FPS Rule", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To maintain a smooth 30 frames-per-second (FPS) processing rate, what is the hard real-time latency budget for a single frame of computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "16 ms", "33 ms", "1 ms"], "correct_index": 2}}, {"id": "edge-0201", "title": "The Sensor Fusion Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum acceptable system latency before the system violates its deadline and risks a critical failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33 ms", "28 ms", "5 ms", "61 ms"], "correct_index": 2}}, {"id": "edge-0202", "title": "The Continuous Batching Throughput Gain", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the percentage increase in overall request throughput achieved by using continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0%", "55%", "45%", "120%"], "correct_index": 3}}, {"id": "edge-0203", "title": "The Duty Cycle Power Gap", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate active-to-deep-sleep power consumption ratio for a TinyML microcontroller in this acoustic sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10x", "100x", "1,000x", "100,000x"], "correct_index": 2}}, {"id": "edge-0204", "title": "The Sensor Pipeline Skew", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary ML systems risk of enabling lossy MJPEG compression before the perception model and disk logging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The raw data rate is ~237 MB/s. The risk is that this is still too high for some storage systems.", "The raw data rate is ~373 MB/s. This exceeds the bandwidth of a 4-lane MIPI CSI-2 bus, so compression is mandatory.", "The raw data rate is ~373 MB/s. The key risk is training-serving skew, as the model was not trained on the compression artifacts introduced by MJPEG.", "The raw data rate is ~3,800 MB/s. 
The main risk is overwhelming the Jetson AGX Orin's memory bandwidth."], "correct_index": 2}}, {"id": "edge-0205", "title": "The Privacy-Preserving Driver Cam", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a system design perspective, what is the primary reason a company would choose Federated Learning over centralized data collection for driver monitoring?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce total training compute cost by distributing it to the edge.", "To achieve higher model accuracy than is possible with centralized training.", "To preserve user privacy by keeping raw video data on the vehicle.", "To allow model training even when vehicles have zero network connectivity."], "correct_index": 2}}, {"id": "edge-0207", "title": "The TOPS vs. TOPS/W Tradeoff", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing chips, what does the 'TOPS/W' (tera-operations per second per watt) metric fundamentally represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak performance: the maximum theoretical compute power of the chip.", "Computational efficiency: the number of operations delivered per watt of power.", "Memory bandwidth: how quickly the chip can access its memory.", "Economic cost: the price of the chip per unit of performance."], "correct_index": 1}}, {"id": "edge-0208", "title": "The Sustained TOPS Reality Check", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "At 15W and 4.6 TOPS/W, what realistic sustained INT8 TOPS should you budget for instead of the 275 TOPS peak?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "4.6 TOPS", "69 TOPS", "26 TOPS"], "correct_index": 2}}, {"id": "edge-0213", "title": "The Voice Assistant's First Word", "topic": "ota-firmware-updates", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the driver perceives the system as highly responsive and 'instant', which of the following metrics is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Time To First Token (TTFT)", "End-to-End Generation Time", "Achieved Batch Throughput (requests/sec)"], "correct_index": 1}}, {"id": "edge-0214", "title": "The Real-Time Batching Fallacy", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why will batching two 30 FPS frames violate the 33ms
deadline even though single-frame inference takes 20ms?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 1}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It might work if the accelerator is efficient, as total throughput increases.", "It fails because the processing time for two frames (2 x 20ms = 40ms) is longer than the 33ms deadline.", "It fails because the system must wait 33ms for the second frame to arrive before starting the ~35ms batch inference, making the first frame's result available at ~68ms.", "It will work, because the average latency per frame in the batch is less than 33ms."], "correct_index": 2}}, {"id": "edge-0215", "title": "The Hidden Cost of Continuous Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With a 15ms batching timeout and 20ms inference, what latency does an idle single frame see, and does it meet the 33ms deadline?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The frame misses its deadline, as its total latency becomes 15ms (wait) + 20ms (inference) = 35ms.", "The frame meets its deadline, as its inference time is only 20ms, well within the 33ms budget.", "It's impossible to know without knowing the batching overhead of the inference server.", "The frame meets its deadline, because the timeout (15ms) is less than the inference time (20ms)."], "correct_index": 0}}, {"id": "edge-0216", "title": "The Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure TCO (Total Cost of Ownership) perspective on an edge fleet, what is the primary economic reason a company might choose Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It saves power on the edge device by requiring less on-device computation.", "It reduces cloud compute costs by pre-aggregating gradients on the edge.", "It dramatically reduces network bandwidth costs by avoiding raw video uploads.", "It lowers the direct cost of compliance with privacy regulations."], "correct_index": 2}}, {"id": "edge-0221", "title": "Sizing an SRAM Tensor Arena for DMA", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum required size for the tensor arena to run this model without running out of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["64 KB (Largest single tensor)", "112 KB (Peak concurrent memory)", "160 KB (Sum of all tensors)", "96 KB (Peak at first layer only)"], "correct_index": 1}}, {"id": 
"edge-0223", "title": "The Activation Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the FP16 and INT8 memory footprints for an 80x80x256 activation map, and what memory reduction does INT8 provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~6.5 MiB (FP16) and ~1.6 MiB (INT8), a 4x reduction", "~1.6 MiB (FP16) and ~0.8 MiB (INT8), a 2x reduction", "~3.1 MiB (FP16) and ~1.6 MiB (INT8), a 2x reduction", "~3.1 MiB (FP16) and ~0.4 MiB (INT8), an 8x reduction"], "correct_index": 2}}, {"id": "edge-0226", "title": "The Edge vs. Cloud Power Divide", "topic": "energy-per-operation", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure hardware power perspective, roughly how much more power does a single high-end Cloud GPU consume compared to a typical automotive-grade Edge GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 3-5x more power", "About 100x more power", "Over 20x more power", "They consume roughly the same amount of power"], "correct_index": 2}}, {"id": "edge-0229", "title": "The Drone's Data Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the model's arithmetic intensity, and what does it measure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The model requires 1 TOPS, and the Orin only has 275 TOPS, so it's clearly limited by compute.", "Memory-bound. Its AI of 500 Ops/Byte is less than the Orin's ridge point (~1,342 Ops/Byte), so performance is limited by memory bandwidth.", "Compute-bound. Its AI of 500 Ops/Byte is less than the Orin's ridge point, meaning the processor can't keep up with the required operations.", "Neither. 
With over 200 GB/s of bandwidth, the device can supply the 2 GB of data almost instantly, so there is no bottleneck."], "correct_index": 1}}, {"id": "edge-0234", "title": "The 30 FPS Perception Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a widely recognized industry-standard deadline for processing a single frame in such a system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 ms", "33 ms", "100 ms", "250 ms"], "correct_index": 1}}, {"id": "edge-0236", "title": "The ISP Night-Driving Skew", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which hardware component in the sensor-to-SoC pipeline is the most likely cause of this training-serving skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The main ML accelerator (e.g., the Jetson's Tensor Cores)", "The DRAM memory controller", "The Image Signal Processor (ISP)", "The MIPI CSI-2 physical interface"], "correct_index": 2}}, {"id": "edge-0237", "title": "The Federated Fleet's Billion-Dollar Upload Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest cost that Federated Learning directly addresses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["On-device Compute Power", "Cloud Storage Cost", "Data Transmission Cost (Cellular/Ingress)", "Privacy Compliance Overhead"], "correct_index": 2}}, {"id": "edge-0239", "title": "The Fusion Dividend", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary purpose of applying operator fusion in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To decrease the model's parameter count and memory footprint on disk.", "To combine sequential operations into a single kernel, reducing DRAM traffic and latency.", "To increase the model's arithmetic complexity (FLOPs) to achieve higher accuracy.", "To simplify the model's Python code by abstracting multiple layers into a single function call for readability."], "correct_index": 1}}, {"id": "edge-0240", "title": "The Fusion Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What DRAM traffic do the unfused and fused 1 MB Conv-BatchNorm-ReLU sequences require, and what reduction does fusion achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2 MB", "4 MB", "5 MB", "0 MB, 
because fusion optimizes compute, not memory."], "correct_index": 1}}, {"id": "edge-0241", "title": "The Fundamental Bottleneck Metric", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fundamental metric you would calculate to make this determination?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TOPS/W", "P99 Latency (ms)", "Arithmetic Intensity (Ops/Byte)", "Total Activation Memory (MB)"], "correct_index": 2}}, {"id": "edge-0244", "title": "The INT8 Energy Dividend (edge-0244)", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure hardware physics perspective, what is the approximate energy saving for a single compute operation when using INT8 versus FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x. This matches the exact memory capacity reduction (32 bits to 8 bits).", "~18x. This reflects the exponential difference in transistor switching logic between floating-point and integer ALUs.", "~16x. This reflects a squared relationship of the bit reduction.", "~256x. This assumes energy scales exponentially with the number of representable values (2^8 vs 2^32)."], "correct_index": 1}}, {"id": "edge-0247", "title": "The Hard Real-Time Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following metrics is the most important to define as your primary optimization target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Maximizing batch throughput (Frames Per Second)", "Minimizing average latency (P50)", "Minimizing worst-case latency (P99/P100)", "Maximizing power efficiency (TOPS/Watt)"], "correct_index": 2}}, {"id": "edge-0249", "title": "The Federated Fleet's Data Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following daily data collection strategies results in the highest data cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Uploading 10MB of model gradients per robot via Federated Learning.", "Uploading 500MB of raw camera images per robot for central training.", "Uploading 1MB of summary statistics per robot from on-device analytics.", "Triggering a 'debug dump' on 1% of the fleet, uploading a 1GB snapshot from each."], "correct_index": 1}}, {"id": "edge-0251", "title": "The Edge Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a typical CNN, is the workload likely to be compute-bound or memory-bound on this hardware?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the GPU has 275 TOPS and CNNs are computationally intensive.", "Power-bound, because TOPS/W is the most critical metric for an autonomous vehicle.", "Memory-bound, because the model's arithmetic intensity is lower than the hardware's ridge point.", "Neither, a well-optimized model should be balanced perfectly on the ridge point."], "correct_index": 2}}, {"id": "edge-0252", "title": "The Edge Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the Jetson AGX Orin's ridge point in Ops/Byte given 275 TOPS of INT8 performance and 204.8 GB/s memory bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.3 Ops/Byte", "~18.3 Ops/Byte", "~1343 Ops/Byte", "~0.74 Bytes/Op"]}}, {"id": "edge-0255", "title": "The INT8 Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What will the total memory footprint of the model's weights be after quantizing from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4 GB", "16 GB", "8 GB", "12 GB"], "correct_index": 2}}, {"id": "edge-0258", "title": "The Cost of Data Gravity in Automotive Fleets", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary operational cost that using Federated Learning is designed to mitigate compared to a traditional, fully centralized approach where all data is uploaded to the cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud compute (GPU) costs for model training.", "On-device compute energy consumed during local training.", "Network data transmission (backhaul) costs.", "Cloud storage costs for the raw sensor data."], "correct_index": 2}}, {"id": "edge-0261", "title": "The Autonomous Driving Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this specific layer compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the hardware's ridge point is only ~1.3 Ops/Byte, which the workload far exceeds.", "Compute-bound, because the 400 Giga-Ops workload is a significant fraction of the chip's 275 TOPS capacity.", "It is perfectly balanced, as the arithmetic intensity and ridge point are roughly within a factor of 2x of each other.", "Memory-bound, because its arithmetic intensity (~667 Ops/Byte) is lower than the hardware's ridge point (~1343 Ops/Byte)."], "correct_index": 3}}, {"id": "edge-0264", "title": "The Perception Pipeline's Precision Problem", "topic": "real-time-deadlines", "competency_area": "precision", "track": 
"edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What first-order effect will INT8 quantization have on the DRAM write bandwidth for the FP16 activation bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It has no significant impact on memory bandwidth, only on compute.", "It reduces the memory bandwidth requirement by 4x.", "It halves the memory bandwidth requirement for the operation.", "It doubles the available hardware memory bandwidth of the Jetson device."], "correct_index": 2}}, {"id": "edge-0266", "title": "The Unstable Perception Queue", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With 30 FPS arrivals and 40 ms processing, how many frames are waiting after one second?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 0}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0 frames", "25 frames", "5 frames", "1 frame"], "correct_index": 2}}, {"id": "edge-0267", "title": "The Cellular Data Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the primary economic cost that Federated Learning is designed to reduce in this edge scenario?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 0}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stronger user privacy guarantees", "Lower on-device hardware (CapEx) requirements", "Massive reduction in cellular data transmission costs", "Reduced cloud compute (TFLOPS) cost for training"], "correct_index": 2}}, {"id": "edge-0268", "title": "Identifying Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the standard term for the ratio of compute operations to bytes of data moved from memory, which is the primary determinant of whether a workload is compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TOPS per Watt", "Operational Throughput", "Arithmetic Intensity", "Compute-to-Memory Ratio"], "correct_index": 2}}, {"id": "edge-0269", "title": "The Orin's Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this specific layer compute-bound or memory-bound on the AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity (800 Ops/Byte) is greater than the 
Ridge Point.", "Memory-bound, because the model's Arithmetic Intensity (800 Ops/Byte) is less than the Orin's Ridge Point (~1342 Ops/Byte).", "Compute-bound, because 200 GOPS is a very large number of operations.", "Memory-bound, because 250 MB is larger than the Orin's cache."], "correct_index": 1}}, {"id": "edge-0270", "title": "The Edge VRAM Budget: VRAM Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming the model runs in FP16 precision, what is the minimum VRAM required for the model's weights and activations combined?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["60 MB", "150 MB", "210 MB", "180 MB"], "correct_index": 2}}, {"id": "edge-0271", "title": "The INT8 vs FP16 Energy Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much more energy does a single FP16 operation consume compared to an INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x more energy", "~5x more energy", "~18x more energy", "The energy consumption is roughly the same"], "correct_index": 1}}, {"id": "edge-0272", "title": "The INT8 Memory Payoff: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does the storage footprint change, and what is the final size in MB, after quantizing 11M FP16 weights to INT8?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["22 MB", "5.5 MB", "11 MB", "88 MB"], "correct_index": 2}}, {"id": "edge-0274", "title": "The Perception Pipeline's Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is the 30 FPS system safe for a hard 33 ms deadline if the model averages 25 ms per frame?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 1}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it's safe. The average processing time of 25ms is less than the 33ms deadline.", "No, it's unsafe. The 75% utilization is too high, causing queue delays that will violate the hard real-time deadline.", "Yes, it's safe. 
It can process 40 frames per second (1000ms / 25ms), which is more than the required 30 FPS.", "No, it's unsafe because the system utilization is over 100% (33.3ms / 25ms)."], "correct_index": 1}}, {"id": "edge-0279", "title": "The SRAM Bottleneck", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the memory footprint of the 112x112x256 FP16 activation tensor, and will it fit in 4 MB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.06 MB; it fits. (Misconception: Assumes 1 byte per element/INT8 precision)", "49.0 MB; it does not fit. (Misconception: Confuses bits with bytes, using 16 bytes for FP16)", "6.13 MB; it does not fit.", "6.42 MB; it does not fit. (Misconception: Uses 1,000,000 instead of 1024*1024 for MB conversion)"], "correct_index": 2}}, {"id": "edge-0283", "title": "The Real-Time Perception Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum per-frame latency allowed for a 30 FPS hard real-time perception model?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 1}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 ms", "30 ms", "33 ms", "100 ms"], "correct_index": 2}}, {"id": "edge-0285", "title": "The Myth of Peak TOPS", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a realistic sustained TOPS number you should use for your initial performance estimates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "15 TOPS", "69 TOPS", "4.6 TOPS"], "correct_index": 2}}, {"id": "edge-0288", "title": "The FP16 vs. 
INT8 Energy Tax", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much more energy does a single FP16 MAC (Multiply-Accumulate) operation consume compared to an INT8 MAC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2× more energy", "Roughly 5× more energy", "Roughly 18× more energy", "The energy difference is negligible"], "correct_index": 1}}, {"id": "edge-0289", "title": "The Edge Quantization Footprint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the on-disk size reduction when quantizing the 50M-parameter FP16 LiDAR model to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The size is reduced by 100 MB.", "The size is reduced by 400 MB.", "The size is reduced by 50 MB.", "There is no change in memory size, only compute speed."], "correct_index": 2}}, {"id": "edge-0291", "title": "The Perception Pipeline's Processing Deficit", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the 40 FPS system with 30 ms processing meet its real-time deadline, and what is the per-frame deficit or surplus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is unstable; it has a 5ms deficit per frame because processing takes longer than the frame arrival interval.", "The system is stable; it has a 5ms surplus per frame.", "The system is stable; 30ms is faster than the standard 33ms (30 FPS) automotive deadline.", "The system is unstable; it has a 10ms deficit per frame (40ms - 30ms)."], "correct_index": 0}}, {"id": "edge-0293", "title": "The Fleet Data Toll", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What privacy benefit does FL provide, and what is the daily centralized upload cost for 10,000 vehicles at 10 GB each and $0.10/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$1.00 per day. (This is the cost for a single vehicle, failing to account for the fleet size.)", "$50.00 per day. (This is the cost for the Federated Learning approach, not the requested centralized one.)", "$10,000 per day. (Correct: 10,000 vehicles x 10 GB/vehicle x $0.10/GB)", "$10.00 per day. 
(This calculation incorrectly assumes 10 MB of data per vehicle instead of 10 GB, a common unit error.)"], "correct_index": 2}}, {"id": "edge-0294", "title": "The Quadratic Bottleneck", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following operations has a computational complexity (FLOPs) that scales quadratically with the number of input patches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A 3x3 standard convolution", "A self-attention layer", "A ReLU activation function", "A depthwise separable convolution"], "correct_index": 1}}, {"id": "edge-0295", "title": "The Deceptive Pointwise Convolution", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fundamental hardware reason for this poor performance?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 0}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound and the Orin's TOPS are insufficient.", "The device is thermally throttling due to the high workload.", "The layer is memory-bound due to its low arithmetic intensity.", "The CUDA compiler is generating inefficient machine code for 1x1 convolutions."], "correct_index": 2}}, {"id": "edge-0296", "title": "The Orin's Perception Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 20 GOPS per 13.1 MB layer compute-bound or memory-bound, and what sustainable TOPS should you expect at 30W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, ~138 TOPS", "Compute-bound, 275 TOPS", "Compute-bound, ~138 TOPS", "Memory-bound, ~102 TOPS"], "correct_index": 2}}, {"id": "edge-0299", "title": "The Activation Bandwidth Bottleneck", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does INT8 activation quantization improve latency, and what are the 256x256x512 tensor footprints in FP16 and INT8?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 0}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It reduces the activation size from 64 MB to 8 MB, saving 56 MB of memory transfers.", "It makes the model's weights 2x smaller on flash, which speeds up initial model loading time.", "It halves the activation size from 64 MB to 32 MB, reducing the memory bandwidth required to read/write it.", "It saves 32 MB of DRAM, which prevents the OS from having to swap memory pages to disk."], "correct_index": 2}}, {"id": "edge-0302", "title": "The Federated 
Fleet's Primary Directive", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a systems and economics perspective, what is the primary motivation for choosing federated learning in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce total computational cost by offloading training from expensive cloud GPUs to the vehicle's edge processors.", "To achieve higher model accuracy than is possible with centralized training by using more diverse, real-world data.", "To massively reduce network bandwidth costs and preserve user privacy by not uploading raw sensor data.", "To enable faster, lower-latency inference decisions on the vehicle."], "correct_index": 2}}, {"id": "edge-0304", "title": "The Vision Transformer Quadratic Burden", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a fixed input image size, what is the fundamental scaling reason that the ViT self-attention layers are often computationally prohibitive on such devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Vision Transformers always have more parameters than CNNs.", "The self-attention mechanism's computational cost scales quadratically with the input sequence length.", "Edge NPUs lack the hardware to efficiently perform the matrix multiplications required by Transformers.", "The activation memory required by the Key-Value cache in Transformers is too large for the SRAM on edge devices."], "correct_index": 1}}, {"id": "edge-0305", "title": "The Power Efficiency Litmus Test", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing chips, what does the TOPS/W (Trillion Operations Per Second Per Watt) metric fundamentally allow you to identify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The maximum theoretical compute throughput of the chip.", "The chip's performance on memory-bound vs. 
compute-bound tasks.", "The computational performance delivered per unit of power consumed.", "The dollar cost per trillion operations."], "correct_index": 2}}, {"id": "edge-0306", "title": "The Edge Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given an arithmetic intensity of 2,000 Ops/Byte, is the model compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because the model's arithmetic intensity (2000) is greater than the Orin's ridge point (~1342 Ops/Byte).", "Memory-bound, because the peak memory bandwidth (204.8 GB/s) is numerically much smaller than the peak compute (275 TOPS).", "Compute-bound, because the model's arithmetic intensity (2000) is greater than the Orin's ridge point (~1342 Ops/Byte).", "It's impossible to tell without knowing the model's latency in milliseconds."], "correct_index": 2}}, {"id": "edge-0310", "title": "The Autonomous Driving Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "At 275 INT8 TOPS and a 33 ms deadline, what is the maximum theoretical number of TeraOps the model can perform per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "8,333 TOPS", "9.075 TeraOps", "9,075 TeraOps"], "correct_index": 2}}, {"id": "edge-0311", "title": "The Economics of Fleet Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary motivation for using FL in this automotive scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To achieve higher model accuracy than centralized training.", "To reduce the on-device compute requirements for inference.", "To reduce data transmission costs and protect user privacy.", "To enable faster model training cycles (wall-clock time)."], "correct_index": 2}}, {"id": "edge-0313", "title": "The Depthwise Separable Cost Advantage", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much does this change reduce the number of floating-point operations (FLOPs)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About a 3× reduction", "About a 9× reduction", "About a 256× reduction", "About a 2× reduction"], "correct_index": 1}}, {"id": "edge-0317", "title": "The Perception Model's Memory Diet (edge-0317)", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory in MB is saved for the 256x256x128 activation tensor when converting it from FP16 to INT8?", 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~16.8 MB", "~4.2 MB", "~8.4 MB", "0 MB"]}}, {"id": "edge-0320", "title": "The Federated Learning Bandwidth Diet", "topic": "extreme-quantization", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate size of the FP32 gradient update for a 5 million-parameter model?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["250 MB", "10 MB", "20 MB", "5 MB"], "correct_index": 2}}, {"id": "edge-0322", "title": "The Depthwise Separable Cost Reduction", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much compute do you save by replacing standard 3x3 convolutions with depthwise separable convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 100x reduction.", "Roughly 2x reduction.", "Roughly 9x reduction.", "There is no significant computational reduction."], "correct_index": 2}}, {"id": "edge-0323", "title": "The Thermal Budget Trap", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why might a 60W Jetson AGX Orin be unsuitable for a trunk compute module with a strict 30W power budget?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 275 TOPS performance is too low for a perception model.", "Its memory bandwidth is likely insufficient.", "Its 60W power requirement exceeds the system's 30W budget.", "A GPU is the wrong type of processor for this task."], "correct_index": 2}}, {"id": "edge-0324", "title": "The Jetson Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a layer with 5 trillion INT8 ops and 10 GB of DRAM reads, is it compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 5 trillion operations is a massive workload that will always saturate the GPU.", "Memory-bound, because its arithmetic intensity of 500 Ops/Byte is below the ridge point of ~1342 Ops/Byte.", "Compute-bound, because 5 TOPS / 275 TOPS = 1.8% utilization, meaning compute is the bottleneck.", "Memory-bound, because the memory access (10 GB) is larger than the compute (5 TOPS)."], "correct_index": 1}}, {"id": "edge-0328", "title": "The Autonomous Driving FPS Budget", "topic": "real-time-deadlines", 
"competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To maintain the 30 FPS rate, what is the absolute maximum inference latency your ML model can have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33.3 ms", "10 ms", "23.3 ms", "43.3 ms"], "correct_index": 2}}, {"id": "edge-0329", "title": "The Privacy-First Powertrain", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From an ML systems perspective, what is the primary reason to choose Federated Learning in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce network bandwidth costs from uploading audio files", "To enable faster, real-time model updates for the entire fleet", "To avoid centralizing sensitive user voice data", "To achieve better model accuracy than centralized training"], "correct_index": 2}}, {"id": "edge-0331", "title": "The Units of Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the units of Arithmetic Intensity, the x-axis of this chart?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FLOPS (Floating Point Operations Per Second)", "Bytes/Op (Bytes per Operation)", "Ops/Byte (Operations per Byte)", "TOPS/W (Trillion Operations per Second per Watt)"], "correct_index": 2}}, {"id": "edge-0332", "title": "The Edge Accelerator Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the official specs—275 TOPS and 204.8 GB/s of memory bandwidth—what is the Ridge Point of the Jetson AGX Orin in Ops/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.6 Ops/Byte", "~1.3 Ops/Byte", "~1,342 Ops/Byte", "~0.74 Ops/Byte"], "correct_index": 2}}, {"id": "edge-0334", "title": "The Perception Model's Memory Diet (edge-0334)", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 30 million FP16 parameters, what are the FP16 and INT8 weight footprints and total memory saved after quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reduction is 90 MB.", "The reduction is 60 MB.", "The reduction is 30 MB.", "The reduction is 15 MB."], "correct_index": 2}}, {"id": "edge-0336", "title": "The Federated Learning Privacy Advantage", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a privacy and data transmission perspective, what is the 
fundamental advantage of the Federated Learning approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized training saves $5,000/day by avoiding on-device compute costs.", "Federated learning compresses the 80,000 GB/day of raw video into an encrypted cloud format.", "It transmits only 1,000 GB/day of weight updates, keeping all 80,000 GB/day of raw PII on-device.", "Federated learning requires 80x more bandwidth because gradients are larger than raw images."], "correct_index": 2}}, {"id": "edge-0340", "title": "The Edge Memory Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the new INT8 weight-memory footprint for a 15 million-parameter FP16 model after quantization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 MB", "60 MB", "15 MB", "3.75 MB"]}}, {"id": "edge-0342", "title": "The Hidden Cost of On-Device Training", "topic": "safety-certification", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the most significant new operational cost introduced by choosing the federated learning approach at this scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased server costs for model aggregation.", "The one-time purchase price (CapEx) of the edge processors.", "The aggregate fleet-wide energy consumption for on-device computation.", "The network cost of transmitting model updates."], "correct_index": 2}}, {"id": "edge-0343", "title": "The Federated Fleet TCO: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles, what are the daily and annual cellular costs of uploading 1 GB raw data versus 10 MB FL updates at $2/GB?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 1}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: $20,000/day, Federated: $20,000/day. The model update is likely the same size as the raw data.", "Centralized: $2/day, Federated: $0.02/day. The costs are negligible.", "Centralized: $20,000/day, Federated: ~$200/day. Federated is ~100x cheaper.", "Centralized: $200,000/day, Federated: $2,000/day. 
Both are too expensive."], "correct_index": 2}}, {"id": "edge-0345", "title": "The Transformer's Quadratic Curse on Edge", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a fixed input image size, which operation is the primary compute bottleneck in the ViT, and how does its computational cost scale as you increase the number of input image patches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MLP Head, which scales linearly with the embedding dimension.", "Convolutional stem, which scales linearly with the number of pixels.", "Self-attention, which scales quadratically (O(N^2)) with the number of patches.", "Layer normalization, which scales linearly with the number of patches."], "correct_index": 2}}, {"id": "edge-0346", "title": "The Edge Roofline: Calculating the Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given the device's peak performance of 275 TOPS (INT8) and memory bandwidth of 204.8 GB/s, what is its approximate ridge point?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 1}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.34 Ops/Byte", "~4.6 Ops/Byte", "~1343 Ops/Byte", "~0.74 Bytes/Op"], "correct_index": 2}}, {"id": "edge-0349", "title": "The Depthwise Efficiency Gain", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a 112x112 layer with 64 input channels and 128 output channels, what FLOP reduction do you get from a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["An ~8.4x reduction in FLOPs", "A ~9x reduction in FLOPs", "A ~128x reduction in FLOPs", "A ~2x reduction in FLOPs"], "correct_index": 0}}, {"id": "edge-0350", "title": "The Edge Power Budget", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What typical power draw should you assume for a Hailo-8 edge AI accelerator in the fleet TCO analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30W", "700W", "10mW", "2.5W"], "correct_index": 3}}, {"id": "edge-0352", "title": "Edge Power Efficiency 101", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To make the most of your limited power budget, what is the primary metric you should use to evaluate the power efficiency of the compute?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 0}, "chain_tiers": 
{"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "Memory Bandwidth (GB/s)", "TOPS/W (Throughput per Watt)", "PUE (Power Usage Effectiveness)"], "correct_index": 2}}, {"id": "edge-0357", "title": "The Federated Learning Transmission Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In this setup, what is primarily transmitted from the vehicle back to the central server to update the global model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Raw sensor data (e.g., images, LiDAR point clouds).", "Anonymized user driving statistics and routes.", "Model updates (e.g., gradients or weights).", "Real-time inference latency and power consumption metrics."], "correct_index": 2}}, {"id": "edge-0358", "title": "The Federated Fleet Cost Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles, what is the daily bandwidth cost of uploading 50 GB raw data per car versus a 50 MB FL update at $0.02/GB?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$100,000 (Centralized) vs. $100 (Federated). A 1000x difference.", "$10,000 (Centralized) vs. $10 (Federated). A 1000x difference.", "$1,000,000 (Centralized) vs. $1,000 (Federated). A 1000x difference.", "$10,000 (Centralized) vs. $1,000 (Federated). A 10x difference."], "correct_index": 1}}, {"id": "edge-0360", "title": "The Activation Memory Budget: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the activation memory footprint for a 128x128x64 tensor in FP16 versus INT8?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 MB (FP16) vs. 0.5 MB (INT8)", "256 KB (FP16) vs. 128 KB (INT8)", "2 MB (FP16) vs. 1 MB (INT8)", "16 MB (FP16) vs. 8 MB (INT8)"], "correct_index": 2}}, {"id": "edge-0364", "title": "The Edge Efficiency Trade-off: Standard vs. 
Depthwise Convolution", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can you explain the efficiency gain by calculating the approximate reduction in the number of parameters for this single layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameter count is reduced by ~2x, as the output channels are double the input.", "The parameter count is reduced by exactly 9x, the size of the 3x3 kernel.", "The parameter count is reduced by ~8.7x.", "The parameter count is reduced by ~256x, because the output channel dimension is factored out of the spatial convolution."], "correct_index": 2}}, {"id": "edge-0368", "title": "The Perception Model Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What trade-off does INT8 quantization introduce, and how much memory do 50 million weights require in FP16 versus INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 MB for FP16, 100 MB for INT8. (Incorrectly treats FP16 as 4 bytes and INT8 as 2 bytes)", "100 MB for FP16, 25 MB for INT8. (Incorrectly applies a 4x reduction factor, common for FP32->INT8)", "100 MB for FP16, 50 MB for INT8.", "50 MB for FP16, 12.5 MB for INT8. (Off by a factor of 2 and then incorrectly applies 4x reduction)"], "correct_index": 2}}, {"id": "edge-0372", "title": "The Depthwise Separable Efficiency Gain (edge-0372)", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For the 256-channel 3x3 layer, how many MACs per output pixel does a depthwise separable version use versus standard, and what is the reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~256x", "~2x", "~9x", "1x (no change)"], "correct_index": 2}}, {"id": "edge-0373", "title": "The Data Privacy Firewall", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a systems perspective, what is the primary and most fundamental reason to choose Federated Learning in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly reduces the amount of computation needed on the vehicle's processor.", "It achieves higher final model accuracy compared to centralized training.", "It keeps sensitive user data on the device, enhancing privacy and radically reducing network costs.", "It allows the vehicle to operate completely offline without any need for a central server."], "correct_index": 2}}, {"id": "edge-0375", "title": "The Efficiency Litmus Test", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", 
"phase": "both", "question": "What is the single most important efficiency metric you should use to compare how much compute performance each chip delivers for a given amount of power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "Power Draw (Watts)", "TOPS / Watt", "PUE (Power Usage Effectiveness)"], "correct_index": 2}}, {"id": "edge-0378", "title": "The Orin's Ridge Point: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the Jetson AGX Orin's ridge point from 275 TOPS and 204.8 GB/s, and what does that value represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~168 Ops/Byte", "~1.3 Ops/Byte", "~1,342 Ops/Byte", "~0.74 Bytes/Op"], "correct_index": 2}}, {"id": "edge-0381", "title": "The Data Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total daily data upload volume that the cloud ingest system must be designed to handle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 GB", "~1 TB", "~10 TB", "~100 TB"], "correct_index": 2}}, {"id": "edge-0382", "title": "The Federated ROI Calculation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given centralized costs of 10 GB/car/month at $0.05/GB plus $20/TB storage versus $200k federated CapEx, what is the break-even time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 months (Assuming $10/TB storage)", "2.4 months (Assuming $5,000 OpEx)", "28.5 months", "The centralized approach is always cheaper"], "correct_index": 2}}, {"id": "edge-0386", "title": "The Residual Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of adding two 512x512x256 INT8 tensors with no caching, and is the operation compute- or memory-bound?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 1}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.0 Ops/Byte; it's memory-bound.", "~0.5 Ops/Byte; it's memory-bound.", "~0.33 Ops/Byte; it's compute-bound.", "~0.33 Ops/Byte; it's memory-bound."], "correct_index": 3}}, {"id": "edge-0387", "title": "The Quantization Energy Tax", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a hardware physics perspective, how much more energy does a single FP32 MAC consume than a 
single INT8 MAC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FP32 is ~2-4× more expensive.", "They consume roughly the same amount of energy.", "FP32 is ~18× more expensive.", "INT8 is ~4x more expensive."], "correct_index": 2}}, {"id": "edge-0388", "title": "The Quadratic Cliff", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a 64x64 feature map with 128 channels, how does the compute cost of self-attention compare with a 3x3 convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly equivalent in cost, as they both process the same number of input pixels and have the same channel depth.", "The convolution is more expensive because its 3x3 kernel must scan across the entire feature map.", "The self-attention layer is over 7x more expensive due to its quadratic complexity with respect to the number of pixels.", "The self-attention layer is slightly cheaper because it has fewer parameters than the convolution's pointwise step."], "correct_index": 2}}, {"id": "edge-0389", "title": "The Federated Fleet Cost Equation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single biggest economic advantage of the Federated Learning approach in this scenario?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Lower on-device compute requirements.", "Reduced cloud storage costs for raw data.", "Drastically reduced cellular egress fees.", "Faster model convergence during training."], "correct_index": 2}}, {"id": "edge-0390", "title": "The Federated Fleet Economy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles at $0.10/GB, what is the daily data cost of a centralized 1 GB/day upload strategy compared to federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$10/day. This incorrectly uses the federated update size (10,000 * 10MB = 100GB) for the calculation.", "$0.10/day. This calculates the cost for only a single vehicle, not the entire fleet.", "$1000/day. This correctly calculates the total data from the fleet and applies the per-GB cost.", "$100,000/day. 
This represents a 100x calculation error, perhaps by confusing dollars and cents."], "correct_index": 2}}, {"id": "edge-0392", "title": "The Edge Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much weight memory, in GB, is saved by quantizing a 500 million parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.5 GB", "1.0 GB", "0.5 GB", "4.0 GB"], "correct_index": 2}}, {"id": "edge-0401", "title": "The Fleet Update TCO: OTA vs. Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the annual data cost for sending a 1 GB full-model OTA versus a 100 MB delta update weekly to 10,000 vehicles at $5/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Delta Update strategy is cheaper at $26,000 annually, versus $260,000 for the OTA strategy.", "The Delta Update strategy is cheaper at $2.6M annually, versus $26M for the OTA strategy.", "The Delta Update strategy is cheaper at $260k annually, versus $2.6M for the OTA strategy.", "The Delta Update strategy is cheaper at $5,000 annually, versus $50,000 for the OTA strategy."], "correct_index": 2}}, {"id": "edge-0402", "title": "The TOPS Illusion", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What critical distinction is the PM missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0403", "title": "The DLA vs GPU Partition", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is your colleague right?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0404", "title": "The YOLO vs ViT Question", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do you push back against replacing YOLOv8-S with ViT-B/16 on the Jetson Orin NX at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0405", "title": "The Edge Batch Size Paradox", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is this a terrible idea for your use case, and what does the roofline model tell you about batch=1 on edge?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 2}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"Batching is essential because it increases NPU utilization. The roofline model shows batch=8 is compute-bound with higher throughput, so we should always use it.\"", "\"Batching introduces 267ms of temporal staleness before inference even begins, which violates the real-time control requirements. The roofline model shows batch=1 has 0.086ms latency, which easily meets the 20ms deadline despite low utilization.\"", "\"Batch=1 is compute-bound, so it's already optimal. Batching will make it memory-bound and increase latency.\"", "\"Batch=8 reduces inference latency to 0.345ms, so it should be used. The 267ms staleness is acceptable for a 1 m/s robot.\""], "correct_index": 1}}, {"id": "edge-0406", "title": "The Fleet Firmware Fragmentation Crisis", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you untangle this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0407", "title": "Roofline Inference Latency on Jetson Orin", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the roofline model, what inference latency do you estimate, and is YOLOv8n compute-bound or memory-bound on the Orin NX?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 2}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.063ms, compute-bound", "0.207ms, compute-bound", "0.207ms, memory-bound", "0.063ms, memory-bound"], "correct_index": 2}}, {"id": "edge-0408", "title": "Power Budget for Multi-Model Edge Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 25.5W PoE+, what ML power budget remains, and can all three models run concurrently on the Jetson Orin Nano?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0409", "title": "OTA Update Time for Edge Fleet", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total fleet update time for the 57 MB update across 500 LTE cameras, and what is the bottleneck?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 1}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0411", "title": "The Sparsity Illusion", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the model still slow, and what pruning strategy should you apply to actually get a speedup?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The pruning is not aggressive enough. Increase unstructured sparsity to 90% or higher.", "The model is memory bandwidth-bound. The pruned model needs to be quantized to INT8.", "The hardware can't skip zero-multiplies. Apply structured pruning to remove entire filters, which reduces the dimensions of the weight tensors and thus the total FLOPs.", "The CPU is the bottleneck. The model processing needs to be offloaded to a dedicated DSP."], "correct_index": 2}}, {"id": "edge-0412", "title": "The Invisible Inventory", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of this sudden, targeted failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has drifted due to a change in store lighting, and the 'Glo-Cola' bottle was the first to be affected.", "A recent OTA model update was corrupted, causing it to fail on this specific object class.", "A physical adversarial patch has been placed on the 'Glo-Cola' bottles, making them invisible to the model.", "The camera sensors across the fleet have simultaneously developed a hardware fault that prevents them from seeing the specific red color of the 'Glo-Cola' logo."], "correct_index": 2}}, {"id": "edge-0413", "title": "The Kernel Launch Storm", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most effective first step to solve this performance bottleneck?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 2}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's arithmetic intensity is too low for the hardware; the batch size must be increased to provide more parallel work.", "The model needs to be pruned more aggressively to reduce the total number of FLOPs, as the compute is clearly the bottleneck.", "The system is bottlenecked by kernel launch overhead and DRAM traffic; apply operator fusion to combine sequential operations into single kernels.", "The model is not properly quantized, and using FP16 would provide better hardware mapping on the Tensor Cores."], "correct_index": 2}}, {"id": "edge-0414", "title": "The Real-Time Radar Deadline", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given this data, what is the most direct way to meet the 33ms hard real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has too many G-MACs. Unstructured pruning reduces G-MACs by 80% but does not resolve the 90ms memory stall.", "The accelerator clock speed is insufficient. 
Overclocking increases TOPS but memory stall remains 90ms.", "The model weight matrix is too large for the on-chip SRAM, causing stalls from memory swapping. Use knowledge distillation to create a smaller student model that fits in SRAM.", "The host CPU is bottlenecking the pipeline. C++ preprocessing saves 5ms but memory stall is still 90ms."], "correct_index": 2}}, {"id": "edge-0415", "title": "The AV Training Bottleneck", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is data parallelism only giving 1.2x speedup, and what parallelism strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data input pipeline is starving the accelerators. Increase the number of CPU data loader workers.", "The interconnect is insufficient for data parallelism. The hardware is flawed and a faster version (>200 GB/s) is needed.", "The `AllReduce` communication is a serial bottleneck. Switch to Pipeline Parallelism to reduce cross-accelerator data transfer.", "The batch size is too small, leading to low arithmetic intensity. Double the batch size to better saturate the hardware."], "correct_index": 2}}, {"id": "edge-0416", "title": "The Robot's Split Brain", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which interconnect solution will solve the latency problem for the 256 MB transfer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a 10GbE connection with RDMA to minimize latency.", "Bridge the two modules with an NVLink connector.", "Connect the modules via a PCIe Gen 4 switch.", "Use a standard 10GbE connection between the modules."], "correct_index": 2}}, {"id": "edge-0417", "title": "The Federated TCO Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which strategy has the lower TCO for 1 million cameras at $10/GB data cost, centralized streaming or federated learning, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The centralized approach is cheaper because it saves the company $3 million in upfront hardware costs.", "The federated approach is far cheaper, breaking even on the extra hardware cost in just 30 days due to enormous data transfer savings.", "The federated approach is cheaper, but the breakeven point is over 2 years, making it a risky investment.", "Neither is viable, as the on-device compute power cost for the federated fleet would exceed any potential data savings."], "correct_index": 1}}, {"id": "edge-0420", "title": "The Drone Vision Architecture Debate", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which architecture is the better fit for the Hailo-8, the lightweight ViT or the MobileNetV2-style CNN, and why?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is better because its global self-attention is more powerful than local convolutions.", "The CNN is better because its operations have high spatial locality, matching the dataflow architecture of the accelerator.", "It doesn't matter which model is chosen; the accelerator's compiler will optimize the dataflow for maximum performance.", "The ViT is better because it requires fewer parameters than the CNN."], "correct_index": 1}}, {"id": "edge-0421", "title": "The Fragmented Inference Graph", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What optimization should you apply to get the 45ms model under the 33ms frame deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increase the GPU clock speed and power draw to make the existing kernels run faster.", "Apply structured pruning to the Conv2D layers to reduce the model's total FLOPs.", "Fuse the BatchNorm and Activation layers into their preceding Conv2D operations.", "Convert the entire model to INT8 precision to reduce memory bandwidth pressure."], "correct_index": 2}}, {"id": "edge-0422", "title": "The Drone's 25ms Mystery", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the 67 MB PCIe Gen4 transfer between Orins too slow, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 67 MB payload is too large for the PCIe Gen4 bus, which is saturated. 
The system requires an interconnect with higher bandwidth like NVLink.", "The LPDDR5 memory on the receiving module (Module B) is the bottleneck; its bandwidth is insufficient to ingest the 67 MB payload from the PCIe bus at speed.", "The transfer is defaulting to a CPU-mediated path instead of using direct peer-to-peer DMA (RDMA), introducing significant OS and driver overhead.", "The system is using a TCP/IP stack over the PCIe bus for communication, and the network protocol overhead is the source of the high latency."], "correct_index": 2}}, {"id": "edge-0423", "title": "The Federated Learning Power Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the incremental energy cost of the FL cameras versus the control group, scaled across a fleet of 10,000 cameras over one year?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0426", "title": "The Hardware-Unaware NAS Penalty", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely reason the NAS-generated model runs slower despite having 20% fewer MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NAS model over-utilized the on-chip SRAM, causing cache misses.", "The NAS algorithm was not run for enough epochs to find a truly optimal model.", "The model's fragmented graph structure prevents operator fusion, incurring significant kernel dispatch overhead on the accelerator.", "The 20% reduction in MACs was not enough to overcome the fixed latency costs of the Hailo-8's architecture."], "correct_index": 2}}, {"id": "edge-0427", "title": "The Sensor Fusion Sync Failure", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a PCIe Gen4 x16 link move the 256 MB feature map within the 33ms frame budget, and how much latency does it add?", "chain_ids": ["edge-chain-auto-secondary-009-21"], "chain_positions": {"edge-chain-auto-secondary-009-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["There must be a driver issue or excessive software overhead; a 32 GB/s link should transfer 256 MB in under 1ms.", "The CPUs orchestrating the DMA transfer are the bottleneck; the PCIe bus itself is not the issue.", "The PCIe bus is the bottleneck. 
The measured 9ms is close to the theoretical transfer time of ~8.1ms, indicating the link is saturated.", "You should replace the PCIe link with an InfiniBand NDR connection, as it is a higher performance networking fabric."], "correct_index": 2}}, {"id": "edge-0429", "title": "The Perception Model's Performance Ceiling", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary bottleneck keeping the Orin at 20 FPS under the 40W TDP, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. It has too many operations (2 T-Ops) for the GPU, so we must prune layers or simplify the architecture.", "The Jetson's power management is faulty. It is thermal throttling prematurely, as it should handle 40W without such a large performance drop.", "The model is memory-bandwidth bound. Its low arithmetic intensity means performance is limited by data movement, not compute.", "The bottleneck is the PCIe bus transferring data from the host CPU to the GPU before inference can begin."], "correct_index": 2}}, {"id": "edge-0432", "title": "The Stereo Vision PCIe Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely bottleneck causing the P2P_Transfer stall for the 512x512x256 FP16 feature map over PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The host CPU is too slow to manage the DMA transfer between the GPUs.", "The InfiniBand network connecting the car's compute unit to the cloud is saturated.", "The GPU compute is insufficient, and the model needs to be quantized or pruned.", "The PCIe bus's peer-to-peer transfer latency is too high for the 134MB feature map, consuming ~8-10ms of the 33ms budget."], "correct_index": 3}}, {"id": "edge-0434", "title": "The AV Perception Latency Puzzle", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the GPU roofline model, what primary bottleneck prevents the 500 G-Ops, 10 GB LiDAR model from reaching 30 FPS on the Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is thermal throttling. 
The 30W power draw indicates it can't access its full 60W performance budget.", "The model is too computationally complex (500 G-Ops) for the Jetson AGX Orin to execute within the 33ms time budget.", "The model is memory-bound; its low arithmetic intensity (50 Ops/Byte) is below the Orin's ridge point (~1343 Ops/Byte), so performance is limited by memory bandwidth.", "The bottleneck is high software overhead from launching too many small CUDA kernels, which adds significant latency."], "correct_index": 2}}, {"id": "edge-0436", "title": "The Sensor Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What interconnect bottleneck is most likely when moving a 512 MB tensor from GPU A to GPU B within the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server's InfiniBand network card is misconfigured, limiting its bandwidth.", "CPU driver overhead for initiating the async cudaMemcpy call is creating a multi-millisecond scheduling delay.", "The GPUs are communicating over the PCIe bus instead of a direct, high-bandwidth NVLink bridge.", "The cudaMemcpyPeerAsync operation is incorrectly blocking the CPU, which stalls the next perception stage."], "correct_index": 2}}, {"id": "edge-0438", "title": "The Transformer-CNN Resolution Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most effective first step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Downsample the 1920x1080 camera input back to 1280x720 before feeding it to the ViT.", "The Jetson AGX Orin's 204.8 GB/s memory bandwidth is the bottleneck. The model needs to be moved to a platform with faster on-chip memory.", "Replace the Vision Transformer with an efficient CNN architecture (e.g., based on depthwise separable convolutions) and re-profile performance.", "Apply more aggressive INT4 quantization to the ViT. 
This should provide a ~2x speedup over INT8, which is enough to meet the deadline."], "correct_index": 2}}, {"id": "edge-0439", "title": "The Sensor Fusion Stall", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural bottleneck explains a 34ms GPU-to-GPU transfer for a 33 MB frame when host-to-GPU takes only 1.7ms?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 2}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand connection between the GPUs is saturated.", "The CPU is overloaded and cannot schedule the DMA transfer efficiently.", "The GPUs are connected over the PCIe bus, which has high overhead for peer-to-peer transfers.", "An inefficient CUDA memcpy call is causing software-level stalls."], "correct_index": 2}}, {"id": "edge-0442", "title": "The Vision Transformer Traffic Jam", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is a CNN a better fit than a ViT for the 33ms camera model on the edge GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has too many parameters, causing excessive swapping between DRAM and flash storage during inference.", "The model has not been properly quantized to INT8. The FP32 operations are too slow for the hardware.", "The ViT's self-attention mechanism has a low arithmetic intensity, making it memory-bandwidth bound on the architecture.", "The ViT's total FLOPs exceed the 275 TOPS rating of the device, making it compute-bound."], "correct_index": 2}}, {"id": "edge-0443", "title": "The Autonomous Vehicle Interconnect Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can two same-board GPUs still show multi-millisecond transfer stalls, and what interconnect should the platform use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network card connecting the two GPUs is faulty.", "The CPU is bottlenecked scheduling the DMA transfer; upgrade the CPU.", "The GPUs are communicating over the general-purpose PCIe bus, which adds significant protocol overhead.
Use NVLink.", "Quantize the model from FP16 to INT8 to cut transfer size."], "correct_index": 2}}, {"id": "edge-0444", "title": "The Federated Fleet's Financial Failure", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most accurate evaluation of the true Total Cost of Ownership (TCO) of the FL driver-alert system, including failure-mode costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device training is consuming too much power, leading to high electricity costs and battery degradation concerns.", "The daily cellular data uploads of model weights are exorbitantly expensive.", "A single model failure event, caused by 'straggler bias' inherent to the FL approach, cost more than all other operational expenses.", "The hardware is inefficient; choosing a lower-power edge accelerator would have significantly reduced costs."], "correct_index": 2}}, {"id": "edge-0445", "title": "The Real-Time Vision Stall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given this profile, what is the most effective optimization to consistently meet the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Aggressively prune more filters from the convolutional layers to reduce the model's overall FLOPs.", "Increase the GPU's clock frequency to reduce the execution time of each individual kernel.", "Apply operator fusion using a graph compiler like TensorRT to combine sequential layers into single kernels.", "Increase the number of CPU worker threads in the data loading pipeline to feed the GPU faster."], "correct_index": 2}}, {"id": "edge-0446", "title": "The Federated Fleet Bottleneck", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck is making FL training slow when 50M-parameter gradients are uploaded over 10 Mbps LTE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is compute-bound; the vehicles' accelerators are too slow to compute gradients on a 50M parameter model, causing the server to wait.", "The system is communication-bound; uploading the ~200MB FP32 gradient payload over the 10 Mbps link is the dominant time sink.", "The central server is the bottleneck; it cannot aggregate gradients from 1,000 vehicles simultaneously and requires a more scalable architecture.", "The system is memory-bound; the 50M parameter model is too large for the vehicles' local RAM, causing excessive page swapping to disk."], "correct_index": 1}}, {"id": "edge-0447", "title": "The Self-Driving Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the roofline model, is the 24.4ms Orin perception model compute-bound or memory-bound, and how should you optimize it?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. To improve performance, we must prune the model to reduce its 4 Trillion Ops.", "The Jetson platform is inadequate. We must switch to a platform with higher memory bandwidth to meet the 20ms deadline.", "The model is memory-bound. We must use operator fusion or more aggressive quantization to improve its arithmetic intensity.", "The bottleneck is power; the Orin is likely being thermally throttled. We should improve the cooling solution."], "correct_index": 2}}, {"id": "edge-0448", "title": "The Driver Monitoring Latency Budget", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under a 15W budget and 8ms fixed overhead, which model meets the 33ms deadline: Efficient-CNN or Small-ViT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0450", "title": "The Federated Fleet ROI Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which first-year model update strategy is more cost-effective and secure for the 1M-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Training, because its annual TCO of ~$194,000 is significantly lower than the Federated Learning strategy.", "Federated Learning, because its annual data transfer cost of $39,600 is far lower than the Centralized approach.", "Federated Learning; although its Year 1 TCO is higher (~$357,600 vs ~$194,000), it's the superior choice because it avoids transmitting sensitive raw user data, reducing privacy and regulatory risk.", "Centralized Training, as the 400TB of raw data can be easily anonymized before upload, which removes the privacy concern and makes the cheaper option viable."], "correct_index": 2}}, {"id": "edge-0453", "title": "The Real-Time Driver Monitoring Dilemma", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you solve this problem and ship both features while staying within a hard 33ms real-time budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Replace the CNN with a more expressive Vision Transformer (ViT) architecture to learn both tasks simultaneously.", "Run Neural Architecture Search (NAS) to automatically find a single, more efficient architecture for both tasks.", "Implement a Mixture-of-Experts (MoE) architecture with a gating network that routes inputs to a specialized 'drowsiness' or 'distraction' expert.", "Apply 8-bit quantization. 
Since the model is compute-bound, reducing the data size will not help latency."], "correct_index": 2}}, {"id": "edge-0454", "title": "The Autonomous Vehicle Scaling Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 256 MB GPU-to-GPU transfer over 100 Gbps InfiniBand taking 30ms, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand link is saturated. We must upgrade to a 400 Gbps NDR link to increase bandwidth.", "The bottleneck is software overhead in the RDMA drivers. We need to write a custom kernel module to reduce latency.", "The use of an inter-node fabric (InfiniBand) for this tightly-coupled task is the bottleneck. The GPUs must be co-located in a single chassis with an NVLink bridge.", "The PCIe bus on the host machine is saturated by other peripherals. We need to rearrange the PCIe cards."], "correct_index": 2}}, {"id": "edge-0455", "title": "The Over-Budget Fleet Update", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the proposed federated learning schedule fit the $30,000/month cellular budget, and what is its monthly data cost?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 2}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$8,000. The plan only requires 1,000 GB of upload data, which is well within budget.", "$1,200. The data transfer per communication round is only 150 GB.", "$48,000. The plan requires 6,000 GB of data transfer per month at $8/GB, exceeding the $30,000 budget.", "$9,600. 
The plan requires 1,200 GB of data transfer per month."], "correct_index": 2}}, {"id": "edge-0456", "title": "The Autonomous Perception Latency Puzzle", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which parallelism strategy should you use across the two accelerators to get under 33ms, tensor or pipeline parallelism?", "chain_ids": ["edge-chain-auto-secondary-016-09"], "chain_positions": {"edge-chain-auto-secondary-016-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Pipeline parallelism is better, with a latency of 20.5ms.", "Tensor parallelism is better, with an effective latency of 24ms.", "Pipeline parallelism is the only option, but it fails with a latency of 40.5ms.", "Neither strategy works; tensor parallelism is too slow at 34ms because of communication overhead."], "correct_index": 1}}, {"id": "edge-0457", "title": "The ADAS Pipeline Frame Drop", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the 22ms custom LiDAR layer compute-bound or memory-bound on the 15W Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; 1.5 G-Ops is too much work for the 15W power profile. Increasing the Orin's power budget to 60W is the only solution.", "The layer's performance is limited by PCIe bus contention with other peripherals in the ADAS system.", "The layer is memory-bound. Its arithmetic intensity of 1.875 Ops/Byte is far below the Orin's roofline, and the 22ms latency is driven by poor memory access patterns given the theoretical minimum time of ~7.8ms to read 800 MB over the 102.4 GB/s bus.", "The layer is compute-bound. The INT8 operations are likely not mapping efficiently to the Orin's Tensor Cores, leading to low TOPS utilization. A custom CUDA kernel is needed."], "correct_index": 2}}, {"id": "edge-0458", "title": "The Drone's Dilemma: Compute vs. 
Bandwidth", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the INT8 EdgeViT miss the 33ms deadline despite its G-Op count seeming feasible on the Orin?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The EdgeViT's G-Op count is misleading; its self-attention operations are inherently more complex and take more clock cycles per OP on the Jetson's architecture.", "The EdgeViT is likely causing the Jetson to exceed its thermal design power (TDP), forcing the chip to throttle its clock speed and increase latency.", "The EdgeViT has a lower Arithmetic Intensity (AI) due to its memory access patterns, making it memory-bandwidth bound on the Jetson's 205 GB/s DRAM interface.", "The CUDA kernels for Transformer operations are not as mature as CNN kernels, leading to significant software overhead from things like kernel launch latency."], "correct_index": 2}}, {"id": "edge-0459", "title": "The Autonomous Vehicle Interconnect Dilemma", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Should you transfer the 128 MB feature map between the two SoCs over PCIe Gen4 x16 or 100 Gbps Ethernet with RDMA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose 100 Gbps Ethernet because RDMA provides a zero-copy transfer that is more efficient than the standard PCIe driver stack.", "Choose 100 Gbps Ethernet because it is the standard for low-latency GPU communication in high-performance clusters.", "Choose PCIe. Its raw bandwidth is higher and its architecture as a direct memory bus results in lower CPU overhead and less latency jitter.", "Neither is sufficient. The true bottleneck is the MIPI camera interface, which cannot supply data fast enough."], "correct_index": 2}}, {"id": "edge-0460", "title": "The Fleet vs. The Cloud: Drowsiness Detection TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which plan is more cost-effective for updating the drowsiness model across 500,000 vehicles, centralized telemetry or federated learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Plan A is more economical. The 10 MB weekly updates in Plan B are significantly larger than the video clips, making it more expensive in the long run.", "Plan B is more economical, with an annual cost of ~$24K compared to Plan A's ~$360K, but the difference is modest enough to prefer Plan A for simplicity.", "Plan B is overwhelmingly more economical. At $0.10/GB, it saves ~$336K/year; at enterprise cellular rates ($10/GB), the gap widens to ~$30M/year, making centralized collection financially infeasible.", "Both plans cost roughly the same (~$300K/year). 
The choice depends on model performance needs, not cost."], "correct_index": 2}}, {"id": "edge-0461", "title": "The ADAS Pipeline Stall", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the Hailo inference and CPU fusion stages run serially or as a pipeline to meet the 33ms AEB frame deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0462", "title": "The Autonomous Driving Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the root cause of the depthwise convolution bottleneck on the Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; the Jetson's 275 TOPS is insufficient for the 115.6M operations required.", "The device is power-throttling due to the high operation count, which is reducing the effective TOPS.", "The layer is memory-bound; its arithmetic intensity is far below the hardware's ridge point, so performance is limited by memory bandwidth.", "The bottleneck is the MIPI CSI-2 camera interface, which cannot supply data to the model fast enough."], "correct_index": 2}}, {"id": "edge-0463", "title": "The Edge Roofline Dilemma: CNN vs. ViT", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you decide whether the ViT can replace the optimized CNN while still meeting the 33ms deadline on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is compute-bound. Its latency will be ~27ms (7.5 T-Ops / 275 TOPS), so it is a feasible replacement.", "The ViT has ~1.8x more Ops, so it will be ~1.8x slower. Its latency will be 24ms * 1.8 = ~43ms, missing the deadline.", "The ViT is memory-bound. Its latency will be ~73ms, violating the 33ms deadline.", "Both models are compute-bound, but the ViT has more Ops. 
Its latency will be ~27ms which is too close to the deadline, so it is not feasible."], "correct_index": 2}}, {"id": "edge-0464", "title": "The Autonomous Data Jam", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the 25ms CPU-to-GPU cudaMemcpy slowdown despite fast NVLink GPU-to-GPU transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The host CPU is too slow to stage the 10 GB/s data stream, causing a data preparation bottleneck before the transfer begins.", "The NVLink bridge is misconfigured, and the CPU-to-GPU communication is secretly falling back to a slower interconnect path.", "The InfiniBand card used for data logging is saturating the PCIe bus, leaving insufficient bandwidth for the CPU-to-GPU data transfer.", "The CUDA driver has high overhead, adding fixed latency to every transfer operation regardless of PCIe bandwidth."], "correct_index": 2}}, {"id": "edge-0465", "title": "The Fleet-Learning Cost Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which proposal is cheaper for improving rare near-miss recall across 100,000 vehicles, centralized clip collection or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because the $0.02/GB cloud ingress cost is 500x cheaper than the cellular reimbursement rate, making centralized data transfer effectively free.", "Federated Learning, because it eliminates the punishing $3M annual manual labeling cost that centralized processing requires.", "Centralized, because avoiding the $50,000 annual baseline OPEX for FL servers means the centralized approach maintains higher margins.", "Federated Learning, but the fiscal benefits are trivial because the labeling savings are offset by the steep $260k data transfer cost."], "correct_index": 1}}, {"id": "edge-0466", "title": "The Sensor Fusion Bottleneck: 3D Parallelism", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you parallelize the independent camera branches to get the multi-camera model below the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use data parallelism: process frame N on stream 1 and frame N+1 on stream 2 to increase throughput.", "Use tensor parallelism: split the layers of the fusion head to parallelize its computation.", "Use model parallelism: run the front camera backbone and the side camera backbone simultaneously on different streams, then fuse.", "Use pipeline parallelism: assign the front camera backbone to early layers, and the side camera backbone to later layers."], "correct_index": 2}}, {"id": "edge-0467", "title": "The Autonomous Driving Roofline Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At
the 40W power limit, is the LiDAR segmentation model compute-bound or memory-bandwidth-bound on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. Its arithmetic intensity of 898 Ops/Byte is very high, meaning the 184 TOPS of work is too much.", "The model is primarily limited by thermal throttling. The 40W power budget only allows for ~184 TOPS, completely restricting execution.", "The model is memory-bound. Its arithmetic intensity of 600 Ops/Byte is below the Jetson's power-constrained ridge point of ~898 Ops/Byte, so performance is dictated by memory bandwidth.", "The bottleneck is inefficient kernel execution. The model AI of 600 Ops/Byte implies 100% compute utilization."], "correct_index": 2}}, {"id": "edge-0468", "title": "The Transformer's Traffic Jam", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is MicroViT slower than the CNN on Jetson Orin despite having similar computations?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0469", "title": "The Sensor Fusion Stutter", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on a napkin math calculation, what is the most likely cause of the ~33ms 'dark time' latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The OS scheduler overhead for managing two GPUs is causing context switching delays.", "The PCIe bus is saturated; transferring 2 GB at the given 60 GB/s bandwidth takes ~33ms.", "The sensor ingress pipeline is using slow Ethernet, creating a data input bottleneck before the GPUs are even used.", "The model is memory-bound, and the 33ms is due to slow HBM access on the GPUs."], "correct_index": 1}}, {"id": "edge-0470", "title": "The Driver Monitoring TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which update strategy has the lower 2-year TCO for quarterly updates across 100,000 vehicles, centralized or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because the $10M one-time hardware cost for the fleet is prohibitively expensive.", "Centralized, because the $50,000 cloud training cost is much cheaper than the $100 per-unit BOM increase.", "Federated, because the $10M one-time hardware cost is significantly less than the $16.4M total cost of data uploads and training over 2 years.", "Federated, because performing on-device training consumes less total energy than cloud training."], "correct_index": 2}}, {"id": "edge-0471", "title": "The Autonomous Driving Perception Stall: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the 5 
trillion-op, 50 GB LiDAR model on Orin compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound; its 5 Trillion operations are too much for the AGX Orin's GPU.", "The device is power-throttling, as the AGX Orin cannot sustain the required TOPS/W in its 15W power mode.", "The model is memory-bound; its low arithmetic intensity means performance is limited by memory bandwidth, not compute.", "The MIPI CSI-2 camera interface is the bottleneck, as it cannot feed frames to the system fast enough."], "correct_index": 2}}, {"id": "edge-0472", "title": "The Autonomous Driving Frame Drop", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you mathematically evaluate this ViT proposal for edge deployment, and what is a superior architectural optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Replace the ResNet-50 backbone with a small Vision Transformer (ViT) of similar GFLOPs.", "Launch a Neural Architecture Search (NAS) to discover an optimal architecture for the edge hardware.", "Replace the standard 3x3 convolutions with depthwise separable convolutions.", "Keep the architecture but apply aggressive INT4 quantization to the convolutional layers."], "correct_index": 2}}, {"id": "edge-0473", "title": "The Autonomous Truck's Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which interconnect—10Gbps Ethernet, PCIe Gen4 x8, or NVLink 4.0—is feasible for the 512 MB transfer within the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use 10 Gbps Automotive Ethernet. It is an industry standard and the simplest to implement.", "Use a PCIe Gen5 switch. The transfer time of ~8ms is a significant portion of the budget but is manageable.", "Redesign to a single board using NVLink 4.0. The ~0.6ms transfer time makes the data transfer overhead negligible.", "Use InfiniBand with RDMA. 
It is optimized for low-latency datacenter communication and would be the fastest option."], "correct_index": 2}}, {"id": "edge-0474", "title": "The TCO of Privacy on Wheels", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which privacy-preserving strategy should you choose for 500,000 vehicles after comparing 3-year TCO and breach risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0475", "title": "The Autonomous Driving FPS Mystery", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the roofline model, what bottleneck explains 12 FPS despite 95% GPU utilization for the 40 GFLOP, 16 GB LiDAR model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. 95% GPU utilization indicates the compute units are saturated and we need a more powerful GPU to reach 20 FPS.", "The device is thermal throttling, lowering its clock frequency and missing the FPS target.", "The model is memory-bandwidth bound. Its arithmetic intensity is too low for the architecture, causing the GPU to stall while waiting for data.", "The model is PCIe-bandwidth bound. The 16 GB of data cannot be transferred over the bus fast enough to support 20 FPS."], "correct_index": 2}}, {"id": "edge-0476", "title": "The Perception Deadline Miss", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 5M-parameter ViT take 100ms when the 5M-parameter MobileNetV2 detector takes 25ms, despite nearly identical parameter counts?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 2}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has poor cache locality and is overwhelming the memory bandwidth. We should reduce the model's embedding dimension.", "The 5M parameters in the ViT are stored in FP32, while the CNN's were INT8, leading to a 4x larger memory footprint that is causing swapping to flash storage.", "The ViT's $O(N^2)$ compute scaling in its attention layers creates a massive FLOPs burden compared to the CNN's linear scaling, making it compute-bound. We should use NAS to find an efficient architecture.", "This is a training-serving skew issue. 
According to Chinchilla scaling laws, the ViT was undertrained, and we need to increase the training dataset size by 20x."], "correct_index": 2}}, {"id": "edge-0477", "title": "The Autonomous Vehicle Perception Stall", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What bottleneck is causing the missing ~20ms and 33% GPU utilization when copying eight 16MB camera streams to the Orin GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU LPDDR5 memory bandwidth is saturated by the vision transformer's attention heads.", "The PCIe bus host-bounce latency (16.2ms) is starving the compute cores.", "The model's 10ms compute time is causing thermal throttling, downclocking the PCIe bus.", "The 128MB transfer directly saturates the InfiniBand networking stack."], "correct_index": 1}}, {"id": "edge-0478", "title": "The Secure Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is adding cryptographic hash commitments to FL updates economically justified for the 10,000-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the annual operational cost is over $1,369,000, which far exceeds the risk-adjusted loss.", "No, the potential $1,000,000 loss is a 'black swan' event; the certain $13,690 annual energy cost should be avoided.", "Yes, the ~$1,100 annual energy cost is significantly lower than the expected loss from a potential model poisoning attack.", "Yes, but only because the $1,095 cost is less than the $7,300 hardware replacement cost."], "correct_index": 2}}, {"id": "edge-0479", "title": "The Autonomous Driving Perception Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the depthwise separable convolution layer on Orin compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound because 500 Million OPs is too much computation for a single layer on an edge device.", "The layer is memory-bound because its arithmetic intensity (10 Ops/Byte) is far below the hardware's ridge point (~1,342 Ops/Byte).", "The layer is thermally limited; the AGX Orin is likely throttling and unable to deliver its peak 275 TOPS.", "The layer is compute-bound because its low arithmetic intensity means it isn't doing enough work per byte to be efficient."], "correct_index": 1}}, {"id": "edge-0480", "title": "The Vision Transformer Deadline Miss", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 640x640 INT8 ViT on Orin, are the self-attention layers compute-bound or memory-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The ViT's 16 GOPS are too high for the hardware, making it compute-bound. The MobileNet's 0.5 GOPS are a better fit for the available compute.", "The low GPU utilization indicates a CPU bottleneck in the data preprocessing pipeline; the model choice is irrelevant until the data loading is fixed.", "The ViT is memory-bound, confirmed by its low Arithmetic Intensity (100 Ops/Byte). Switching to MobileNet, which moves 10x less data, should reduce latency by ~10x to ~4.5 ms.", "Both models are memory-bound, but the MobileNet has a lower Arithmetic Intensity (~17 Ops/Byte), which means it will be even more memory-bound and thus slower than the ViT."], "correct_index": 1}}, {"id": "edge-0482", "title": "The Federated Fleet's Budget Blowout", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the primary driver of the 5x daily cellular data cost increase in the 10,000-vehicle FL A/B test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 25% increase in model size is the primary cause, as larger binaries always have proportionally higher transfer costs.", "The new model is uploading raw sensor data logs for debugging, which is a standard practice in A/B tests for safety-critical systems.", "The 4x increase in communication frequency is the dominant cost driver, creating a 5x multiplicative effect when combined with the larger model size.", "On-device training for the new model is more compute-intensive, causing the cellular modem to draw more power and transmit more telemetry, leading to higher data charges."], "correct_index": 2}}, {"id": "edge-0484", "title": "The Drone's Dilemma: Transformer vs. CNN Latency", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is the manager correct to suggest halving the patch size to 8x8 to speed up the ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The manager is wrong. The quadratic scaling of attention with the increased sequence length will cause a catastrophic increase in latency. The team should use the MobileNet-L.", "The manager is right. Smaller patches can be processed in parallel, and the Jetson Orin's high TOPS count can handle the increased workload within the deadline.", "The ViT has too many parameters. The latency issue can be solved by pruning 50% of the weights, which will cut the 50ms latency in half to 25ms.", "The issue is memory bandwidth, not compute. 
Using smaller patches will increase cache hits and could potentially reduce latency."], "correct_index": 0}}, {"id": "edge-0485", "title": "The Autonomous Vehicle Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 512 MB GPU-to-GPU feature-map transfer over 32 GB/s PCIe Gen5 the bottleneck, and would NVLink fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is bottlenecked; it doesn't have enough PCIe lanes to feed both GPUs, causing the stall.", "The problem is the network; InfiniBand is needed to get sensor data to the compute unit faster.", "The 16ms PCIe transfer time consumes nearly 50% of the 33ms real-time budget, making it the bottleneck. A faster interconnect is required.", "The issue is software; the data transfer should be optimized by using RDMA to bypass the CPU."], "correct_index": 2}}, {"id": "edge-0486", "title": "The Federated Fleet Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the projected annual data transmission cost for 1 million vehicles, and is the full rollout economically feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$891k per year. The cost for the 50,000-vehicle pilot is well under budget, so the rollout is approved.", "The primary cost is the on-device power consumption from daily training, not data transfer. The feature is feasible if battery drain is acceptable.", "~$17.8M per year. The feature is not economically viable without significant cost optimization.", "~$48,828 per year. The feature is extremely cheap because federated learning minimizes data transfer compared to sending raw sensor logs."], "correct_index": 2}}, {"id": "edge-0487", "title": "The Autonomous Robot's Power Budget", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you compare the two 3D detection models for performance per watt on the 15W Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0488", "title": "The Utilization Paradox", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which candidate model meets the 13.3ms latency budget once Orin utilization is accounted for, the CNN or the ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B (ViT) is the only choice. Its higher hardware utilization results in a latency of ~11ms, which is within the 13.3ms budget, while the CNN is too slow at ~14.5ms.", "Model A (CNN) is the clear choice. With only 300 G-MACs, it is 2.5x smaller and takes ~2.2ms, easily meeting the deadline.", "Neither model is viable. 
The CNN will take ~14.5ms and the ViT will take over 30ms. Both fail the 13.3ms budget.", "Both models are viable. The CNN takes ~2.2ms and the ViT takes ~5.4ms, both well within the total 33.3ms budget."], "correct_index": 0}}, {"id": "edge-0490", "title": "The Fleet Update Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which update strategy has a lower annual TCO for fixing e-scooter detection across 100,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Strategy A, because the $5M platform fee for Strategy B is far greater than the $800k annual training cost for Strategy A.", "Strategy A, because uploading raw data allows for more powerful, unbiased centralized models, which justifies the higher but manageable data cost.", "Strategy B, because the annual data transfer cost is dramatically lower, making the total cost (~$5.7M) significantly less than Strategy A (~$73.8M).", "The costs are roughly equivalent; the high license fee of Strategy B cancels out the data savings, so the decision should be based on privacy alone."], "correct_index": 2}}, {"id": "edge-0492", "title": "The Edge Transformer Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the ResNet-34 replacement be a Vision Transformer or an efficient CNN to maintain 30 FPS on Jetson Orin?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0495", "title": "The Vision Transformer Performance Cliff", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the ViT underperform the MobileNet-style CNN on Orin despite having fewer parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT model has a bug causing excessive memory leaks.", "The ViT's total FLOP count (8 GFLOPs) is simply too high for the Jetson AGX Orin's compute capacity.", "The ViT has a much lower Arithmetic Intensity, making it severely memory-bandwidth bound on the Jetson hardware.", "The ViT has fewer parameters, which means it cannot take full advantage of the GPU's parallelism."], "correct_index": 2}}, {"id": "edge-0496", "title": "The Autonomous Vehicle PCIe Stall", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the 42ms camera-to-GPU latency regression, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated. 
Moving 200MB is too slow and the system requires an upgrade to a wider bus or compression.", "The GPU's HBM3 memory controller is overwhelmed, causing backpressure on the PCIe bus and slowing down the data ingress.", "The data is in pageable host memory, forcing a hidden CPU-bound copy into a pinned staging buffer before the GPU DMA transfer can start.", "The system's NVLink switch is congested with model-parallel traffic, interfering with the PCIe controller and adding queueing delay."], "correct_index": 2}}, {"id": "edge-0497", "title": "The Delivery Robot's Perception Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the LiDAR workload compute-bound or memory-bound, and what should you optimize to reach 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The GPU is at 100% utilization, which means the compute units are fully saturated and are the bottleneck.", "Compute-bound. The model requires 40 G-IOPs, which is a heavy computational load that is challenging for an edge device.", "Memory-bound. The workload's arithmetic intensity (~381 Ops/Byte) is significantly below the hardware ridge point (~1,342 Ops/Byte), so performance is limited by memory bandwidth.", "Memory-bound. The 105 MB parameter footprint exceeds the L2 cache, forcing all operations to spill to SSD storage."], "correct_index": 2}}, {"id": "edge-0498", "title": "The Drone's Dropped Frames", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using arithmetic intensity, why is the ViT bottlenecked on the Orin and why would MobileNetV3 likely be faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is memory-bandwidth-bound due to the low arithmetic intensity of attention; the CNN's higher arithmetic intensity makes it compute-bound and a better fit for the Orin's architecture.", "The GPU is at 100%, so the system is thermally throttling. A new model won't help until we add a bigger heatsink.", "The ViT model is too large and is causing slow data swapping from NVMe storage, which is the bottleneck.", "The model has too many layers. 
Reducing the depth of the ViT is the only way to meet the latency target, as model family doesn't matter."], "correct_index": 0}}, {"id": "edge-0499", "title": "The Autonomous Vehicle DMA Bounce", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 256 MB GPU-A to GPU-B transfer taking about 20ms instead of the expected 8ms, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen4 x16 bus (32 GB/s) lacks the necessary bandwidth for a 256 MB transfer, which requires an upgrade to PCIe Gen5.", "The transfer is taking 20ms because the data is being routed through the 1 Gigabit Ethernet network switch instead of the PCIe bus.", "The transfer is bouncing through CPU host memory because peer-to-peer DMA is not enabled, effectively doubling the bus traffic and overhead.", "The NVLink bridge connecting the GPUs is faulty, forcing a fallback to the much slower PCIe bus for the transfer."], "correct_index": 2}}, {"id": "edge-0500", "title": "The Fleet Learning TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the first-year TCO of the centralized A/B pipeline versus Federated Learning, and when does FL become cheaper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The A/B Telemetry approach is more expensive because daily data uploads from 10,000 cars create a higher server load and data ingress cost than weekly uploads from the FL fleet.", "The Federated Learning approach is more expensive, primarily because the total data volume from sending 5MB model updates across 100,000 cars significantly exceeds the telemetry data volume.", "The Federated Learning approach has a much higher TCO in Year 1, driven predominantly by the large, one-time engineering investment required to build the secure aggregation infrastructure.", "The on-device compute for federated training on each car's ECU consumes significant power, making the fleet-wide energy cost the largest component of TCO for the FL approach."], "correct_index": 2}}, {"id": "edge-0502", "title": "The Robot's Perception Power Budget", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which model should you choose under the 15W, 30 FPS budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B is the better choice because its calculated latency (1.5 GB / 204.8 GB/s ≈ 7.3ms) is faster, providing a larger safety margin within the 33ms budget.", "Neither model is viable, as their compute requirements (500-800 GOPS) are too high for an edge device.", "Model A is the only choice because its compute-bound nature makes it far more power-efficient, whereas Model B's memory-bound nature would cause it to exceed the 15W power budget.", "Both models are viable, but Model A is preferable because compute-bound workloads are easier to 
optimize with techniques like kernel fusion."], "correct_index": 2}}, {"id": "edge-0503", "title": "The Autonomous Driving Latency Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the suggestion to use InfiniBand likely flawed, and what is the actual bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The suggestion is correct; InfiniBand's lower latency and higher bandwidth are necessary for real-time sensor streams.", "The board should be redesigned to use NVLink instead of PCIe to connect the sensor card, as NVLink is a faster GPU interconnect.", "The suggestion is wrong because InfiniBand is a datacenter interconnect; the true bottleneck is likely a 'double copy' issue, which should be solved with a zero-copy protocol like GPUDirect RDMA.", "The CPU is too slow to handle the DMA programming for PCIe. The system needs a CPU upgrade to reduce scheduling latency."], "correct_index": 2}}, {"id": "edge-0504", "title": "The Federated Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which pilot is cheaper on direct data cost, centralized user collection or Federated Learning, and how should you calculate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$2,500. The federated approach is significantly cheaper as the cost is dominated by a single model broadcast.", "$25,000. The federated approach is more expensive because transmitting raw voice data from 10,000 cars is costly.", "$12,500. 
The federated approach is cheaper as the total cost of transmitting iterative model updates is less than the cost of acquiring centralized data.", "The costs are comparable, so the choice depends on implementation complexity rather than TCO."], "correct_index": 2}}, {"id": "edge-0506", "title": "The Self-Attention Latency Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which of the following strategies would you apply to solve this latency crisis while best preserving model quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive INT4 quantization to the existing ViT.", "Use Neural Architecture Search (NAS) to find a smaller ViT configuration.", "Replace the ViT with a CNN built using depthwise separable convolutions.", "Implement a Mixture-of-Experts (MoE) layer."], "correct_index": 2}}, {"id": "edge-0508", "title": "The Wake-Word ROI Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 100,000-car wake-word fleet, what is the first-year TCO of the FL plan versus the traditional retraining plan?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The traditional method is cheaper. The FL data transfer costs would exceed $10 million annually.", "The costs are roughly comparable, so the traditional method is better as it guarantees data quality.", "The FL approach is cheaper, with a first-year TCO of $50,520.", "The FL approach is the cheapest option as it only costs $520 for the entire year."], "correct_index": 2}}, {"id": "edge-0510", "title": "The Self-Attention Traffic Jam", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most effective change to meet the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply INT8 quantization to the Vision Transformer and keep the architecture.", "Increase the Jetson Orin's power mode from 15W to 60W to increase TOPS.", "Replace the ViT with a MobileNet-style CNN using depthwise separable convolutions.", "Increase the batch size to better saturate the GPU cores."], "correct_index": 2}}, {"id": "edge-0512", "title": "The Fleet vs. 
The Cloud: A TCO Showdown", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which daily update strategy has lower operational cost for the 100,000-vehicle fleet, centralized upload or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Strategy A is cheaper because the cloud compute cost for federated learning will grow unpredictably with more users.", "Strategy B is cheaper, but the main savings come from the lower cloud compute cost.", "Strategy B is cheaper primarily due to the ~50x reduction in cellular data transfer costs.", "The costs are roughly equivalent; the privacy benefits of Strategy B are the only real differentiator."], "correct_index": 2}}, {"id": "edge-0515", "title": "The PCIe Latency Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 64 MB PCIe transfer taking 28ms instead of 8ms, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe bus is sharing bandwidth with the GPU, which is also accessing system DRAM. This contention is saturating the bus.", "The MIPI CSI-2 camera interfaces are the real bottleneck and are failing to deliver data to system memory fast enough.", "The driver is using small, non-coalesced memory transfers, making the operation bound by PCIe's transaction latency rather than its bandwidth.", "The effective sustained bandwidth of the PCIe bus is much lower than its theoretical peak, likely around 2.3 GB/s due to hardware limitations."], "correct_index": 2}}, {"id": "edge-0517", "title": "The Vision Transformer Performance Regression", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the new ViT slower than the old CNN on Jetson Orin despite having 20% fewer G-MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The G-MAC count is misleading; Transformer MACs are less efficient and take longer to execute on the Orin's Tensor Cores.", "The ViT is memory-bandwidth bound; its low arithmetic intensity means the compute units are starved waiting for data from DRAM.", "The model has too many parameters, causing it to be capacity-bound by the Orin's 8-32GB of DRAM.", "The performance drop is due to insufficient quantization; the model needs to be converted to INT4 to increase throughput."], "correct_index": 1}}, {"id": "edge-0518", "title": "The Sensor Fusion Traffic Jam", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 1.5 GB feature-map transfer over PCIe Gen4 x16 fit within a 33ms fusion budget, and what is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is 
starving the GPU during sensor data ingestion.", "The PCIe bus is the bottleneck.", "The issue is high PCIe protocol overhead.", "NVLink is designed for training, not inference."], "correct_index": 1}}, {"id": "edge-0519", "title": "The Fleet vs. Federated TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the annual data-transfer costs of centralized video upload and federated learning compare for a fleet of 10,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Learning. The annual cost is only about $27,000, a reasonable price for the high-quality raw data needed for debugging.", "Centralized Learning. The on-device compute required for Federated Learning would add significant thermal load and power consumption, outweighing any data cost savings.", "Federated Learning. The annual data cost is ~$7,300 versus ~$274k for the centralized approach, making it ~40x cheaper while protecting user privacy.", "Federated Learning is not viable. Uploading a 1 MB gradient file over a cellular network will introduce too much latency and miss the vehicle's real-time processing deadlines."], "correct_index": 2}}, {"id": "edge-0520", "title": "The LiDAR Perception Bottleneck: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 1.2 TOps, 4 GB LiDAR model compute-bound or memory-bound on the 30W Orin, and what should you optimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. The 30W power cap enforces thermal throttling, blocking the peak 275 TOPS ceiling.", "The model is CPU-bound. The slow ARM fabric is incapable of staging the point clouds fast enough for the Ampere architecture.", "The model is memory-bound. Its arithmetic intensity of 300 Ops/Byte is far below the Orin's 732 Ops/Byte ridge point.", "The model is strictly compute-bound. Moving 1.2 Trillion analytical ops forces heavy queueing logic on edge tensor cores."], "correct_index": 2}}, {"id": "edge-0521", "title": "The Self-Driving Stall: Diagnosing a Memory-Bound Vision Transformer", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the ViT missing the 30ms deadline with 25% GPU utilization but near-100% memory-controller load on the edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT's 16 GOPS are too high for the hardware, making it compute-bound. The MobileNet's 0.5 GOPS are a better fit for the available compute.", "The low GPU utilization indicates a CPU bottleneck in the data preprocessing pipeline; the model choice is irrelevant until the data loading is fixed.", "The ViT is memory-bound, confirmed by its low Arithmetic Intensity (100 Ops/Byte).
Switching to MobileNet, which moves 10x less data, should reduce latency by ~10x to ~4.5 ms.", "Both models are memory-bound, but the MobileNet has a lower Arithmetic Intensity (~17 Ops/Byte), which means it will be even more memory-bound and thus slower than the ViT."], "correct_index": 2}}, {"id": "edge-0523", "title": "The Automotive Privacy-TCO Tug-of-War", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which drowsiness-training strategy is superior when risk-adjusted TCO is considered, centralized anonymized uploads or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0525", "title": "The Federated Fine-Tuning Payback Period", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is Federated Learning economically feasible for the 10,000-robot reflective-sign problem compared with cloud collection and manual labeling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Manual labeling is cheaper. Running training on 1,000 devices for 3 months will incur massive energy and hardware degradation costs.", "Manual labeling is cheaper. A team of 5 annotators can label the required images in a few weeks.", "Federated Learning is drastically cheaper. The total energy cost is less than $100, whereas the manual labeling cost is $10,000 daily.", "Neither is viable. The security risk of a model poisoning attack during Federated Learning is too high for a safety-critical system."], "correct_index": 2}}, {"id": "edge-0527", "title": "The GPU Power Gating Latency", "topic": "safety-certification", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did GPU power gating increase the initial detection latency to 180ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0528", "title": "The Watchdog Timeout Freeze", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What OS-level event stalled your script for over 200ms?", "chain_ids": ["edge-chain-auto-secondary-001-06"], "chain_positions": {"edge-chain-auto-secondary-001-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0529", "title": "The UVC Camera MJPEG CPU Tax", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did saving USB bandwidth destroy your CPU performance and slow down inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0530", "title": "The Preempt-RT Kernel Tick Overhead", 
"topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did a real-time kernel destroy your ML throughput?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 1}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0532", "title": "The Tracker Addition Budget", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the estimate 20ms + 12ms = 32ms dangerously wrong for fitting within the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0534", "title": "The CAN Bus Bandwidth Crunch", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much effective bandwidth does CAN 2.0B provide, and what share is your 30 Hz ML telemetry consuming?", "chain_ids": ["edge-chain-auto-secondary-009-20"], "chain_positions": {"edge-chain-auto-secondary-009-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0535", "title": "The GPU Driver Crash Recovery", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you maintain perception during a GPU driver crash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "The compiler should always fuse all possible operations; selective fusion never outperforms maximum fusion.", "You need a perception fallback that doesn't depend on the GPU. The Orin has multiple independent compute engines: (1) DLA (Deep Learning Accelerator) — has its own driver stack, independent of the GPU driver. Pre-load a lightweight obstacle detection model (MobileNet-SSD, ~5 MB) on DLA1.", "Update the driver. or \"File a bug with NVIDIA.\" Both are correct long-term actions but don't solve the immediate safety problem for deployed units."], "correct_index": 2}}, {"id": "edge-0538", "title": "The Fleet Firmware Fragmentation", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you handle deploying a new model across this fragmented fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Update all devices to v3.1 first, then deploy the model. 
This ignores why the fragmentation exists.", "Automatic restart on failure is dangerous for safety-critical systems; manual intervention should always be required.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "The fragmentation exists because OTA updates fail silently in the field."], "correct_index": 3}}, {"id": "edge-0539", "title": "The Rain-Soaked Quantization Cliff", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does quantization amplify the weather-related accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0542", "title": "The Ambarella CV5 Encoding Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Since the model and encoder use separate hardware blocks, what causes the inference latency to spike when the encoder is active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0543", "title": "The Qualcomm RB5 Hexagon DSP", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can you improve the drone's battery life by relocating the ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0544", "title": "The Silent NPU Killer", "topic": "safety-certification", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the system to detect silent NPU failures, ensure functional safety, and implement graceful degradation without external intervention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0545", "title": "The Pruning Paradox on Edge AI Accelerators", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 80% unstructured pruning make the 3x3 convolution slower on Jetson AGX Orin and drop Tensor Core utilization to near zero?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0546", "title": "The Autonomous Drone's Latency Crisis", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should you use tensor or pipeline parallelism for the 400MB ViT across the two 256MB accelerators, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0548",
"title": "The Autonomous Vehicle 'FSDP' Fallacy", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did FSDP/ZeRO-style sharding across two Orins over PCIe Gen4 x8 double latency, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0549", "title": "The Edge Training Scaling Collapse", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does data-parallel fine-tuning of the 2B-parameter model fail to scale across four Orins connected by PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0553", "title": "The Automotive Parallelism Dilemma", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should you use tensor or pipeline parallelism for the 20B ViT across two Orins over the 40Gbps link, and why?", "chain_ids": ["edge-chain-auto-secondary-016-09"], "chain_positions": {"edge-chain-auto-secondary-016-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-09": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0556", "title": "The Unaligned Memory DMA Fault", "topic": "extreme-quantization", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the exact same DMA code crash on the M0+?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0557", "title": "The Shared Bus Arbitration Lock", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the NPU frame rate drop to 15 FPS when Ethernet traffic spikes, despite theoretical bandwidth sufficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0558", "title": "The USB Power Suspension", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What OS power management feature is causing this massive cold-start penalty on the USB bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0559", "title": "The Dataflow vs GPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 4x TOPS advantage only yielding a 1.25x speedup?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Hailo-8 is a dataflow architecture — it maps the entire model graph onto a spatial pipeline of physical compute units.", "The Orin drivers aren't optimized yet.", "The system is healthy and no recovery action is needed.", "Operator fusion primarily reduces compute time."], "correct_index": 0}}, {"id": "edge-0560", "title": "The Resolution-Accuracy Pareto", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What critical information does resizing destroy, and how do you design a system that meets the deadline while preserving long-range detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Resizing 4K to 640x640 destroys small object features (e.g., a person becomes 3 pixels tall), making detection impossible. Use a multi-scale ROI tiling strategy to run detection on high-res crops.", "The accuracy degradation is caused by numerical instability in the framework's matrix multiplication kernels, not by the model or data.", "Resize to 640x640 and accept the accuracy loss. This treats resolution as a single knob when it's actually a spatial information budget.", "The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it."], "correct_index": 0}}, {"id": "edge-0561", "title": "The Stereo Depth vs Monocular Trade-off", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the bandwidth, compute, and accuracy trade-offs of replacing stereo with MiDaS-small monocular depth on the TDA4VM forklift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0562", "title": "The Real-Time Scheduling Priority", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are you still missing deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0563", "title": "The Multi-Resolution Input Strategy", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is switching to 320x320 under thermal throttling the right fix, and how would you design adaptive resolution to preserve safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0566", "title": "The EMC Compliance Nightmare", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the neural network doing that generates RF emissions, and how do you fix it without changing the model?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0568", "title": "The Physical Intruder", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What hardware and software mechanisms should be in place to detect and mitigate such an attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0569", "title": "The Early vs Late Fusion Trade-off", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the system-level trade-offs between early and late fusion, and which do you recommend for a Jetson Orin deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0570", "title": "The Edge-Cloud Split Inference", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might splitting the model between edge and cloud be better than either extreme?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0571", "title": "The Dark Silicon Enigma", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What SoC physical limit did you violate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0572", "title": "The TDA4VM Vision Pipeline", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you partition across the heterogeneous cores to guarantee the 33ms deadline?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 4}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all three models on the MMA accelerator sequentially. 
Even if the MMA theoretically supports it, moving models that are better suited for DSP/CPU onto the MMA results in poor utilization.", "Software-only health checks are sufficient; a hardware watchdog timer adds unnecessary complexity to the system design.", "Partition across the heterogeneous SoC by workload characteristics, running YOLOv5s on the MMA, Lane detection on the DSP, and Driver monitoring on the CPU.", "Streaming all raw metrics to the cloud in real-time is the most reliable approach because edge-side aggregation risks losing anomalies."], "correct_index": 2}}, {"id": "edge-0574", "title": "The Futile Pruning", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did the massive reduction in FLOPs yield such a disappointing latency improvement, and why is the proposal to 'just prune more' likely to fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0575", "title": "The EfficientNet Power Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is a model with 3x fewer operations running slower and consuming the same amount of power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0576", "title": "The Headlight Saturation Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did this performance optimization lead to a catastrophic, non-linear failure instead of a graceful degradation in accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0577", "title": "The Perception Pipeline Performance Cliff", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did moving the identical model from 720p to 1080p cause latency to non-linearly explode to 200ms instead of the predicted ~75ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0578", "title": "The Night-Vision Quantization Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is this daytime-only PTQ calibration and validation insufficient, and what catastrophic failure mode should you expect under night-time glare?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0579", "title": "The Night-Vision Quantization Collapse", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What numerical INT8 failure is occurring on the 
Orin, and why does it disproportionately affect high-contrast night scenes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0580", "title": "The Night-Blind Perception Model", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What failure mode can daytime-only INT8 calibration introduce, despite 25ms latency and 99% daytime accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0582", "title": "The Efficient Transformer Paradox", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can a 500-GOp / 200MB model beat a 100-GOp / 500MB model on a 30W edge SoC despite doing 5× more operations?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 4}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0583", "title": "The Perception Pipeline Fusion Fallacy", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under what conditions could this optimization paradoxically increase end-to-end latency and cause you to miss the deadline even more severely?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0584", "title": "The Mixed-Precision Backfire", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did INT8 PTQ make the LiDAR detector miss distant objects, and what activation failure occurs at the first INT8 layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0586", "title": "The Distillation Performance Trap", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why could a lower-FLOP custom replacement for ResNet's Conv-BN-ReLU blocks run slower than the 22ms TensorRT baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0587", "title": "The Sensor Fusion PCIe Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What happens if the camera feature map grows from 128MB to 256MB over PCIe Gen4 x8, and why isn't an external InfiniBand link the right fix?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 4}, "chain_tiers": 
{"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0588", "title": "The Headlight Glare Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did the 'successful' quantization lead to a catastrophic failure in production, and what is your precise, justified plan to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0589", "title": "The Edge Efficiency Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which candidate should you choose under the 15W Orin budget, and why can the lower-MAC Sparseformer be slower and use more power than the dense CNN?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 4}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0590", "title": "The Catastrophic Night-Drive Quantization Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would an optimization designed to make the model faster cause it to become blind to the most salient objects in a scene, and only under specific conditions?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 2}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0591", "title": "The Self-Defeating Optimization Cascade", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did 50% unstructured pruning and FlashAttention make the 1B VLM slower on Jetson AGX Orin instead of meeting the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0592", "title": "The Headlight Blind Spot", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What calibration mistake can make the INT8 pedestrian detector fail on night-time headlights despite fitting the 15W Orin budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0593", "title": "The Speculative Execution JIT Stall", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a drafter-verifier design that 
invokes the ViT only occasionally blow the 33ms deadline with 100-200ms latency spikes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0594", "title": "The Thermal Throttling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does Model A's performance collapse under thermal throttling while Model B's only degrades slightly, and which model is better for production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0595", "title": "The Fused Perception Model Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did fusing the dense and sparse models backfire, leading to a slower and less efficient system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0596", "title": "The Headlight Blindness Quantization Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did PTQ pass daytime validation but fail under headlight glare, and what immediate fix beats simply adding night calibration data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0597", "title": "The Night-Blind Edge Model", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is adding headlight-illuminated images to calibration enough, and what non-linear INT8 failure makes the detector night-blind?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0598", "title": "The Fusion Priority Inversion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would operator fusion that improved average performance cause a catastrophic failure in the system's P99 latency?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 4}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0599", "title": "The Sparse Model Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why might the team lead's prediction be catastrophically wrong, causing Model B to have higher latency and worse power efficiency in production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0600", "title": "The Headlight Overflow Failure", 
"topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the system failing catastrophically instead of showing a graceful mAP drop, and what specific steps would you take to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0601", "title": "The Integer Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you construct the integer roofline for YOLOv8 on the Orin DLA, and why might 18 TOPS not mean compute underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0602", "title": "The Watchdog Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How did a low-priority background ML task kill the real-time thread?", "chain_ids": ["edge-chain-auto-secondary-001-06"], "chain_positions": {"edge-chain-auto-secondary-001-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0605", "title": "Power-Adaptive Inference System", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a power-adaptive inference system that maximizes overall detection accuracy under these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0606", "title": "The Drone Fleet Vision Upgrade", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three design decisions, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0607", "title": "The Hard Real-Time Factory Defect Detector", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect the 4K crack detector to use the ViT within 10ms, and why is a naive ViT impossible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0608", "title": "The OTA Thermal Brick", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why could the 2:4-pruned TensorRT model's 30% lab latency win disappear on passively cooled Orins in the field?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0609", "title": "The Multi-Sensor Contention Collapse", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most likely physical bottleneck causing this counter-intuitive result, and how would you redesign the system software architecture to resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0610", "title": "The Headlight Blindness Problem", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What quantization architecture would meet the 33ms budget while avoiding INT8 activation overflow across extreme lighting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0611", "title": "The Autonomous Drone Reflex Gap", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you run the VLM and locust detector within 30W, double the VLM effective rate to 30 FPS, and alert within 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0612", "title": "The Jet-Lagged Copilot LLM", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you redesign the 7B offline copilot to fit 32GB, keep TTFT under 500ms, and exceed 30 tokens/s on edge hardware?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using a larger model with more parameters will improve both accuracy and latency.", "Post-training quantization always produces equivalent results to QAT.", "FlashAttention-2 solves the prefill capacity limit, and INT4 with Speculative Decoding overcomes the memory bandwidth generation limit.", "Engineers unfamiliar with LLM internals focus on the wrong bottleneck."], "correct_index": 2}}, {"id": "edge-0613", "title": "The Autonomous Trucking Perception Upgrade", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does upgrading from 1.2MP to 4MP cameras break the 33ms deadline, and how would you redesign the model for the 45W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0614", "title": "The Autonomous Vision System Fork: ViT vs. 
Specialized CNN", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which DMS model should you choose for the constrained SoC, ViT or MobileNet-style CNN, and how would you prove it meets the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0616", "title": "The In-Car Assistant Latency Crisis", "topic": "speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions to meet the latency budget, and how do they combine to solve the problem?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0617", "title": "The Autonomous Perception Deadline Miss", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What multi-stage optimization strategy would bring the 20 TFLOP model below 33ms on Orin, and why in that order?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0618", "title": "The 4D Radar Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What heterogeneous compute architecture would meet the 33ms/60W constraints, and which workloads would you place on each unit?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 5}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0619", "title": "The Autonomous Perception Deadlock", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you choose a perception model for the 30W passively cooled module using effective TOPS/W rather than FLOPs alone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0621", "title": "The Autonomous Valet Shrink Ray", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What ordered optimization plan would make the 5x-too-slow valet Transformer meet 33ms on a 15W ECU, and why in that order?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 5}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0622", "title": "The Predictive Overtake Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which proposal is more appropriate for meeting the 50ms latency budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming that because FlashAttention is effective for training in the cloud, it's automatically the best choice for edge inference. At short sequence lengths typical of real-time edge tasks, the N^2 attention matrix may actually fit in on-chip SRAM, diminishing FlashAttention's advantage.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "The correct approach is to first analyze the bottleneck based on the hardware's Roofline model, and then perform a break-even analysis for speculative decoding."], "correct_index": 3}}, {"id": "edge-0623", "title": "The 8K Sensor Upgrade Dilemma", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For 8K/30FPS on the edge SoC, should you launch DenseViT or FastFusion-CNN, and what roofline bottleneck drives the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0624", "title": "The Unified Automotive Perception Stack Design", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architecture would you choose for the unified 8-camera perception stack under real-time SoC constraints: CNN, ViT, or a hybrid?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0625", "title": "The 360° Vision System Latency Collapse", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What new architecture would meet the 33ms and 60W budgets for eight cameras while leaving headroom for future LiDAR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to move to a single, unified architecture that performs early fusion and is deployed with hardware-aware compilation.", "The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck.", "Using a larger model with more parameters will improve both accuracy and latency, because larger models have better computational efficiency.", "Prune the existing 8 models more aggressively to eliminate parameter overhead."], "correct_index": 0}}, {"id": "edge-0626", "title": "The Speculative Braking Rationale Failure", "topic": "speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": 
"create", "status": "published", "phase": "inference", "question": "What three-point plan gets the 1B-parameter rationale generator under 15ms despite autoregressive decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The inference framework is adding unnecessary overhead; switching to a different runtime (e.g., from TensorRT to ONNX Runtime) will fix the issue.", "The most common L5 answer is 'use a much smaller model.' This ignores the product requirement for a certain level of reasoning quality that the 1B model provides. Another incorrect path is 'batch the sensor inputs,' which makes no sense for a single, real-time braking event.", "The core problem is the memory wall of auto-regressive decoding. We hit a latency floor determined by loading the model weights for every single token. The plan must break this sequential dependency.", "The model weights are being duplicated in memory during inference; using model sharding across CPU and GPU will halve the footprint."], "correct_index": 2}}, {"id": "edge-0627", "title": "The Automotive Roofline Dilemma", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which compute architecture would you choose to meet 45W and 30 FPS for the memory-intensive vision model, and why isn't peak TOPS enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0628", "title": "The ADAS Look-Ahead Dilemma", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a full-stack system to make a 3B VLM generate 5 tokens within a 33ms perception loop on an Orin already 70% utilized?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 5}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0629", "title": "The Autonomous Perception Stack Fork", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should you choose between the two 30W perception designs, and why is achieved TOPS-per-watt more useful than FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0630", "title": "The Night-Drive Quantization Collapse", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did the INT8 model fail under headlight glare, and how would mixed precision fix it within the 33ms Orin budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0631", "title": "The Impossible OTA Update: Architecting a Generative VLM for an Automotive SoC", "topic": "ota-firmware-updates", 
"competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the first three architectural pillars of your OTA VLM plan, and what napkin-math improvement should each provide?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 4}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0632", "title": "The Autonomous Vehicle Power-Performance Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Would you choose a monolithic GPU or a heterogeneous 30W compute module, and how does power duty cycle drive the decision?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 5}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0633", "title": "The Autonomous Valet Retrofit", "topic": "ota-firmware-updates", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What phased optimization strategy would get the OTA Transformer BEV model from 200ms to 33ms with the least deployment risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0634", "title": "The Autonomous Freeway Thermal Throttling Crisis", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you evaluate and select Gen-2 edge hardware that meets 30 FPS within the 60W passive-cooling budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0635", "title": "The Autonomous Fleet Compute Upgrade", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is the BEV-Transformer missing the 33ms deadline, and what should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0636", "title": "The Autonomous Perception Power Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural decisions, and how do you justify them with quantitative reasoning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0637", "title": "The Headlight Flare Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How 
would you resolve the HDR INT8 saturation failure and prevent data-dependent quantization failures from recurring?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0638", "title": "The Autonomous Vision Unification Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you make a 2B-parameter ViT replacement for the v4 CNN stack fit the 33ms edge latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0639", "title": "The Perception Platform Redesign", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What roadmap would you propose to move beyond the ResNet CNN stack while staying within the 30W and 33ms Orin limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0640", "title": "The Over-Budget Driver Intention Model", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What multi-stage model, algorithm, and hardware optimization pipeline would you use to get a 15x speedup without dropping below 98% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0641", "title": "The Headlight Blindness Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most likely physical root cause, and how would you redesign your team's quantization and deployment architecture to prevent this entire class of failures in the future?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 3}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0644", "title": "The Autonomous 'Cloud-to-Edge' VLM Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What is the optimal strategy to deploy this 100B-parameter VLM onto a 16 GB edge platform with a 50ms latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0645", "title": "The Foggy Road Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did INT8 PTQ preserve overall accuracy but fail on foggy pedestrians, and how would you fix it?", "chain_ids": ["edge-chain-auto-019-09"], 
"chain_positions": {"edge-chain-auto-019-09": 3}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0649", "title": "The Autonomous Perception Horizon", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Which proposal is viable within the 33ms latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0651", "title": "The Autonomous Perception Stack Redesign: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you evaluate whether the 5% mAP ViT can replace the CNN on Orin, and what is wrong with a FLOPs-only analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0653", "title": "The Autonomous Stack Consolidation", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What systematic optimization plan would you use to bring the 200ms Transformer under the <25ms perception budget on the same Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0655", "title": "The Unexpected Cache Miss Storm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is a common hardware-level reason for such a slowdown in data-intensive loops, even on fast CPUs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0657", "title": "The Bloated INT8 Model", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What explains this discrepancy, and what other components contribute significantly to the overall memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0658", "title": "DRAM Bandwidth for 30 FPS Inference", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Do you have enough DRAM bandwidth to run DeepLabv3-MobileNetV2 at 30 FPS alongside the ISP and display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, peak demand is 13.3 GB/s (52% of 25.6 GB/s), but overlapping ISP bursts adds ~1ms queuing latency.", "Yes, model weights are 2.1 MB, so at 30 FPS it only uses 63 MB/s, which is <1% of 25.6 GB/s.", "No, the ISP continuous read of 4 GB/s plus the 
GPU's 13.3 GB/s burst will exceed 25.6 GB/s.", "No, deep segmentation models require at least 32 GB/s of bandwidth due to self-attention activation mapping."], "correct_index": 0}}, {"id": "edge-0659", "title": "Memory for Multi-Camera Tracking", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total memory footprint for the 4-camera pipeline, and is 16 GB sufficient?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 1}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0660", "title": "The DMA Contention Blind Spot", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What physical limitation is causing the simultaneous frame drop and inference latency spike when neither device saturates the bus?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 3}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0661", "title": "The MMIO Sensor Bottleneck", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does the I2C sensor polling rate limit the ML model's input freshness, and why does this bus bottleneck cause the model's accuracy to drop despite the GPU running at full speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0662", "title": "The Data Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is a likely bottleneck, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0663", "title": "The Swap File Latency Cliff", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Nano with a 4GB swap file experience erratic 4,000ms latency spikes during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0664", "title": "The eMMC Cold Start", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the first inference take 2.3 seconds longer than the theoretical eMMC transfer time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0665", "title": "The Object 
Tracking Memory Budget", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is tracking memory truly negligible, and when does it become a bottleneck?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 2}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0666", "title": "The Edge GPU Memory Bandwidth", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the colleague's estimate wrong in both directions — too pessimistic about compute, too optimistic about the real bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck.", "Performance scales linearly with TOPS.. This assumes the workload is compute-bound on both platforms, which is almost never true for the same model on different hardware.", "The colleague's estimate is wrong because both platforms are memory-bound, not compute-bound. YOLOv8-L has high arithmetic intensity (~600 FLOPs/byte), but on both platforms it exceeds the ridge point, making it memory-bandwidth-bound during many layers.", "INT8 quantization reduces model size by 8x compared to FP32, so inference speed should also improve by exactly 8x."], "correct_index": 2}}, {"id": "edge-0667", "title": "The Inference Memory Leak", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where is the 5.4 GB leak coming from over 3 days of continuous inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has a memory leak — switch to a different framework.. The model itself is stateless between inferences.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "Multiple subtle sources compound over 3 days: (1) TensorRT engine cache — the TDA4VM's TIDL runtime caches optimized execution plans. 
Each unique input shape triggers a new cache entry.", "Converting the model to a different framework's format (e.g., ONNX to TFLite) will automatically optimize it for the target hardware."], "correct_index": 2}}, {"id": "edge-0672", "title": "The Repeated Model Loading Memory Leak", "topic": "compound-ai-systems", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's leaking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.\"", "\"The model files aren't being freed — add explicit `del model` calls.\"", "\"The pipeline bottleneck is always the largest model; optimizing smaller models in the pipeline has negligible impact on end-to-end latency.\"", "\"The Qualcomm SNPE (Snapdragon Neural Processing Engine) runtime allocates intermediate activation buffers on the Hexagon DSP's shared memory (ION/DMA-BUF allocations) each time a model is loaded. When the model is unloaded, SNPE releases the model weights but doesn't fully release the DSP's scratch memory allocations.\""], "correct_index": 3}}, {"id": "edge-0673", "title": "The Zero-Copy Illusion", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If the pointer was passed directly, why is there still a latency spike?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0674", "title": "The Multi-Model Memory Sharing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you fit 760 MB of ML workload into 4 GB with room for growth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0675", "title": "The Edge LLM Memory Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What's consuming the memory?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 2}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"The model is leaking memory\" or \"Activations are too large.\" The weights are static and activations are small for a single-token decode step.", "The KV-cache. During autoregressive generation, the model stores the key and value tensors for every token generated so far, across every layer.
For Phi-3-mini (32 layers, 32 heads, head_dim=96): KV-cache per token = 2 (K+V) × 32 layers × 32 heads × 96 dim × 2 bytes (FP16) = 393 KB per token.", "If the inference process is still running (visible in ps/top), the system is healthy and no recovery action is needed.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count."], "correct_index": 1}}, {"id": "edge-0676", "title": "The Occupancy Grid Map Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the actual memory and bandwidth requirements for the 200m occupancy grid, and when does it become the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Switching to a lower precision format (e.g., INT4) will double the effective compute throughput and solve the utilization gap.", "Deploy the update to all devices simultaneously to minimize the total rollout duration and reduce the window of version inconsistency.", "A 2D grid is simple and cheap. This ignores the update rate, temporal history, and bandwidth implications of a high-resolution, high-frequency map.", "Grid dimensions: 200m / 0.1m = 2000 cells per axis. Total cells: 2000 × 2000 = 4 million cells. Per-cell storage: 4 + 4 + 1 = 9 bytes."], "correct_index": 3}}, {"id": "edge-0677", "title": "The Brownout Weight Corruption: Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happened to the model in RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0681", "title": "The Transformer Patch Limit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the ViT completely crash with an OOM error while the CNN merely slows down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0682", "title": "The LiDAR Point Cloud Memory Explosion", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the bottleneck?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 3}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "The bottleneck is point cloud preprocessing, not model inference.
Voxelizing 1.2M points (5 sweeps × 240K) involves random memory scatter operations — each point maps to a random voxel location. CPU-based voxelization achieves only ~5 GB/s effective bandwidth (random access pattern) vs 204.8 GB/s peak, taking ~33ms.", "The model inference is too slow. PointPillars on an Orin GPU takes ~15ms."], "correct_index": 2}}, {"id": "edge-0683", "title": "The NUMA-Aware Edge AI", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What advanced memory architecture concept might explain this throughput decrease, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0684", "title": "The Shared Bandwidth Bottleneck", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most probable system-level bottleneck, and how would you redesign the system to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0685", "title": "The Zero-Copy Nightmare", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architectural requirements, challenges, and pitfalls determine whether true zero-copy tensor sharing works on this SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0686", "title": "The Fixed-Point Trade-off", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does FP32-to-fixed-point deployment drop mAP from 90% to 65%, and what steps would you take to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0687", "title": "The PTQ vs QAT Question", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do you need two weeks for quantization-aware training instead of a five-minute TensorRT INT8 PTQ pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0688", "title": "The Coral Edge TPU Quantization Constraint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are your options to handle the 4 fallback layers, and which provides the best tradeoff between accuracy and battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0689", "title": "The Night Scene Calibration Failure", "topic": "quantization-fundamentals", "competency_area": "precision",
"track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the INT8 PTQ calibration?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 1}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0690", "title": "The INT8 Calibration Drift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the quantization, and how do you fix it without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0691", "title": "INT8 Calibration Set Size vs Accuracy", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many calibration images do you actually need, and what happens if you use too few or too many?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Use all 50,000 images for calibration — more data is always better.. Calibration is not training.", "Calibration determines the min/max (or percentile) range of activations per layer to set the INT8 scale factors. 
The key insight: you need enough samples to capture the activation distribution's tails, not to train the model."], "correct_index": 3}}, {"id": "edge-0692", "title": "Quantization Impact on Detection mAP", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which precision should you deploy for 8 camera streams under a 33ms budget, and what latency and throughput do FP32, FP16, and INT8 imply?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 1}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0693", "title": "The QAT Cliff", "topic": "safety-certification", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong with PTQ, and what is the principled fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0694", "title": "The Disappearing Pedestrian", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the common pitfalls of deploying FP32 models to INT8 hardware, especially concerning robustness, and how would you diagnose and mitigate these issues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0695", "title": "The Mixed-Precision Perception Stack", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What precision would you use for object detection, monocular depth, and motion planning on the Orin, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0696", "title": "The Multi-Core Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you decide where to run the pre-processing stage to minimize overall pipeline latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0697", "title": "The DLA vs GPU Scheduling", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "When does offloading the backbone to a DLA actually improve performance over running everything on the faster GPU?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using a larger model 
with more parameters will improve both accuracy and latency, because larger models have better computational efficiency.", "Your manager is partially right — for a single model on a plugged-in system, the GPU is faster and simpler. But DLA scheduling matters in three scenarios: (1) Pipeline parallelism — the DLA runs the detection backbone while the GPU simultaneously runs the tracking model.", "The thermal issue is caused by ambient temperature; adding a larger heatsink will fully solve the throttling without any software changes.", "'DLA is always better because it's more power-efficient' or 'GPU is always better because it's faster.' Both ignore the scheduling context."], "correct_index": 1}}, {"id": "edge-0698", "title": "The Edge GPU Driver Crash Loop", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's different about these 30 devices, and how do you design the system to tolerate GPU driver crashes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0700", "title": "The Heterogeneous Scheduler's Dilemma", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you design a real-time task scheduler and resource allocator to ensure all critical tasks meet their deadlines while optimizing for power efficiency and overall system utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model should be retrained with fewer parameters to reduce inference time, as model size is the primary driver of latency.", "\"Just put everything on the NPU, it's fastest.\"", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "A sophisticated, multi-level scheduling and resource management approach is required. It involves workload profiling, static vs.
dynamic scheduling, hardware-aware task mapping, and resource isolation."], "correct_index": 3}}, {"id": "edge-0701", "title": "The Edge LLM Context Window", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong with the multi-turn generation latency, and how is memory bandwidth involved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0702", "title": "The Multi-Model Scheduling Problem", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you schedule these across the Orin's GPU, 2 DLAs, and CPU to meet the deadline?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0703", "title": "Multi-Hardware Model Optimization Pipeline", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a CI/CD pipeline that builds and validates optimized binaries for Orin NX, Hailo-8, and Coral from one PyTorch checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0704", "title": "The Adaptive Model Diet", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you design a runtime that switches among model variants to meet changing latency, throughput, thermal, and power budgets without human intervention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0705", "title": "The Heterogeneous Choreographer", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition and schedule the multi-stage ML pipeline across CPU, GPU, DSP, and NPU to minimize latency and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0706", "title": "The Functional Safety Redundancy Cost", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an ASIL-D redundant perception path within 50ms without duplicating the GPU, model, and sensor hardware?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0708", "title": "The NPU Definition", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does NPU stand for, and what mathematical operation is it physically optimized to perform?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network Processing Unit; optimized for fast Wi-Fi routing.", "Neural Processing Unit; optimized for dense Multiply-Accumulate (MAC) operations.", "Node Partition Unit; optimized for virtualizing the edge operating system.", "Numeric Precision Unit; optimized for high-accuracy 64-bit floating point math."], "correct_index": 1}}, {"id": "edge-0709", "title": "Thermal Throttling on Edge", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What physical constraint is most likely causing this sudden and permanent drop in performance?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 0}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have drifted due to prolonged inference.", "The OS garbage collector is pausing the inference thread.", "The device overheated, causing the OS to drop the clock frequency (thermal throttling).", "The L1 cache has become permanently fragmented."], "correct_index": 2}}, {"id": "edge-0710", "title": "The A/B Partitioning Storage Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What minimum flash storage is required for a 1 MB firmware image with A/B OTA partitions and 100 KB bootloader/OS overhead?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 0}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.1 MB", "~2.1 MB", "~2.0 MB", "Slightly more than 1 MB"]}}, {"id": "edge-0711", "title": "The Hard Real-Time Heartbeat", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a reasonable timeout to set for this watchdog?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 ms", "100 ms", "1 second", "40 ms, the round-trip-time for US cross-country fiber"], "correct_index": 1}}, {"id": "edge-0716", "title": "The Frozen Robot Problem", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hardware mechanism would reboot the robot if its perception software enters an infinite loop and freezes?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A software liveness probe that pings a monitoring service.", "Error-Correcting Code (ECC) memory to prevent corruption.", "A hardware watchdog timer that triggers a CPU reset.", "A graceful degradation module that switches to a simpler model."], "correct_index": 2}}, {"id": "edge-0720", "title": "The OTA Flash Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When planning the over-the-air (OTA) update, which of the following is the most fundamental constraint to identify first?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The vehicle's 4G/5G network bandwidth for the download.", "The power consumed by the flash write operation.", "The available storage space in the inactive firmware partition.", "The compute time required to validate the new model post-installation."], "correct_index": 2}}, {"id": "edge-0722", "title": "The OTA Update Budget", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the FP16 model weight size for 50M parameters, and what is the total download size with a 150 MB container base?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 MB", "100 MB", "250 MB", "350 MB"], "correct_index": 2}}, {"id": "edge-0723", "title": "The OTA Download Fallacy", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long does an 8 GB update take to download over a 1 Gbps cellular connection, ignoring protocol overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 seconds", "4 seconds", "64 seconds", "32 seconds"], "correct_index": 2}}, {"id": "edge-0724", "title": "The Overnight OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long does the 8 GB update take at 10 Mbps, and is the 8-hour overnight window sufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~13.3 minutes", "~1.8 hours", "~2.4 hours", "~14.2 hours"], "correct_index": 1}}, {"id": "edge-0731", "title": "The OTA Update Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the $2M one-time R&D cost compare to lifetime OTA data costs for 100,000 vehicles over 5 years?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The R&D cost is $2M, while the data cost is only $400,000 (Trap: calculated 1 year of updates).", "The data cost is $10M, dwarfing the R&D cost (Trap: forgot to convert MB to GB).", "The R&D cost ($2M) and the total data cost ($2M) are exactly equal.", "The costs are negligible because 200MB is small."], "correct_index": 2}}, {"id": "edge-0734", "title": "The OTA Downtime Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum downtime to write the 4 GB model and 32 GB map to UFS 4.0 flash during installation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8.6 seconds", "~17.1 seconds", "~1.9 seconds", "~7.6 seconds"], "correct_index": 1}}, {"id": "edge-0738", "title": "The Automotive OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long will the 2 GB OTA update download take over a stable 40 Mbps cellular connection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~51 seconds", "~16 seconds", "~6.8 minutes", "~2.7 minutes"], "correct_index": 2}}, {"id": "edge-0739", "title": "The OTA Flash Budget Crunch", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the 120 MB vision model update be safely deployed on the 512 MB ECU, and how much flash remains free?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 1}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it fits. The OS and app use 248 MB, leaving 264 MB of free space.", "Yes, it fits. The 120 MB model can be downloaded directly into the 150 MB OTA partition.", "No, it does not fit. After reserving space for the OS and the OTA partition, only 114 MB of flash remains, which is less than the 120 MB required.", "Yes, it fits. 
After the OS (48 MB) and OTA partition (150 MB) are reserved, there is 314 MB of space for the application."], "correct_index": 2}}, {"id": "edge-0741", "title": "The OTA Bandwidth Trap: OTA & Firmware Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Roughly how long will one vehicle take to download the 8 GB OTA container over a sustained 100 Mbps connection?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 0}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~80 seconds (~1.3 minutes)", "~11 minutes (~640 seconds)", "~1 minute", "~1 hour"], "correct_index": 1}}, {"id": "edge-0742", "title": "The OTA Update Bottleneck", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum OTA download time for one robot to download the 120 MB model over the 10 Mbps cellular link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 seconds (Calculated Trap)", "96 seconds", "2400 seconds (Calculated Trap)", "9.6 seconds (Calculated Trap)"], "correct_index": 1}}, {"id": "edge-0746", "title": "The Edge Container Overhead", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does Docker's memory overhead (cgroups, overlay2) reduce the available GPU memory for ML model weights and activations, and what does the actual memory budget look like?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0747", "title": "The Bricked OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why are ML model updates significantly more dangerous than generic firmware updates in constrained environments, and how does size dictate partition architecture?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 2}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0748", "title": "The Watchdog Timer", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you detect and recover from a silent TensorRT engine hang when the OS remains responsive?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a hardware watchdog timer — a dedicated hardware peripheral that must be \"kicked\" at regular intervals. 
If the kick doesn't arrive within the timeout period, the watchdog triggers a hard reset.", "Converting the model to a different framework's format (e.g., ONNX to TFLite) will automatically optimize it for the target hardware.", "Check if the process is running. But the process *is* running — it's blocked inside a CUDA call.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes."], "correct_index": 0}}, {"id": "edge-0749", "title": "The Edge Data Collection Funnel", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should the system decide which images to upload to collect valuable training data without exceeding the 2 GB/month limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0750", "title": "The Zero-Touch Provisioning Pipeline", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is generic provisioning insufficient for 1,000 Coral devices, and what hardware-specific model compilation and calibration must it include?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 0}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0752", "title": "The Gradual Rollout Guru", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the system to enable this A/B testing, ensuring a smooth rollout and easy rollback if issues arise?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 0}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0753", "title": "The OTA Brick Risk", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is pushing a TensorRT .engine OTA riskier than a generic firmware update, and how must deployment handle model-runtime coupling?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 3}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0754", "title": "The Boot Time Budget", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you cut boot-to-first-detection from 22 seconds to under 3 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0755", "title": "The Resource 
Tug-of-War", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you ensure the safety model always meets its deadlines without significantly starving the analytics model when both contend for the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0756", "title": "The Edge Model A/B Testing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why doesn't cloud-style A/B testing work on edge, and what's the alternative?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 1}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0757", "title": "The Canary Deployment Gone Wrong", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What went wrong with your canary strategy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0758", "title": "The Silent Accuracy Drift", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you detect and diagnose accuracy drift without ground truth labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0760", "title": "Diagnosis: Unrecoverable Boot Loop from Coupled OTA Rollback", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What went wrong with your rollback strategy?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 2}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0762", "title": "The Edge-Cloud Sync Conflict", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you handle the sync, and what happens to the stale devices' inference results in the meantime?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0763", "title": "The Hardware SKU Qualification Matrix", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this wrong?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0764", "title": "The Inference Audit Trail Gap", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong to cause the deterministic model to produce non-deterministic outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0766", "title": "The CAN Bus Telemetry Flood", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong, and how do you fix it without removing the telemetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0767", "title": "The Cellular Diet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you efficiently manage this deployment without massive overages?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0770", "title": "The Offline-First Edge Design", "topic": "safety-certification", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the system to operate independently of cloud connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0771", "title": "The Model Versioning Fleet Problem", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many distinct model binaries do you need to maintain, and what's the real operational cost?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 1}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0772", "title": "The Hardware Lifecycle Cliff", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a 3-year transition strategy for 50,000 Jetson TX2 devices reaching end of life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0773", "title": "The Disconnected Brain", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you ensure reliable model updates, send diagnostic telemetry, and maintain local inference capability when the connection drops for extended periods?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0774", "title": "The Fleet Heterogeneity Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you handle the 8x compute gap between the weakest and strongest devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Train one model and compile it for each platform.. A model that runs well on the Orin will OOM or miss deadlines on the Nano.", "The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it.", "You need a model tiering strategy — multiple model variants compiled from the same training run, each targeting a hardware tier:", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model."], "correct_index": 2}}, {"id": "edge-0775", "title": "The Bandwidth-Constrained Model Update", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship the 8.1 MB model update without blowing the 500 MB/month cellular data budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0776", "title": "The On-Device Drift Detector", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design an on-device drift detection system for the Hailo-8 quality inspection fleet within the 2.5 W power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0777", "title": "The Predictive Maintenance Model Lifecycle", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's happening, and how do you fix the lifecycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0778", "title": "The Polyglot Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you efficiently build, test, and deploy ML models across this heterogeneous fleet?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 2}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0779", "title": "The Canary in the Coal Mine", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you implement a safe, 
phased rollout strategy with robust monitoring to catch issues early?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 2}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0781", "title": "Fleet-Wide Model Drift Detection Threshold", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What statistical threshold should trigger a drift alert, and is the mean confidence drop from 0.82 to 0.71 real drift or normal variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0782", "title": "The Inconspicuous Sticker Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design your edge vision system to detect and mitigate such physical-world adversarial attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0783", "title": "The Fleet Health Dashboard", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What metrics do you collect, how do you aggregate them, and what are your alerting thresholds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0784", "title": "The Remote Debugging Nightmare", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you diagnose and fix the erratic inference issue remotely over a high-latency, low-bandwidth connection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0785", "title": "The Privacy Guardian", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design an end-to-end system that respects privacy while enabling ML development?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0787", "title": "The Privacy-Preserving Drift Correction", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you fix the model without ever seeing the raw data?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": 
{"edge-chain-auto-secondary-017-40": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0788", "title": "Self-Healing Edge AI Fleet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a self-healing system to cut interventions from 150 per week to fewer than 15 without adding SREs?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 3}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count.", "Classify failure modes by frequency and automate top causes. Typical: (1) model OOM/crash (40%) — automated restart + fallback model; (2) connectivity loss (25%) — local buffering + exponential backoff; (3) sensor degradation (20%) — automated recalibration triggers; (4) storage full (10%) — automated log rotation.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "Add better monitoring and alerting.. More alerts without automated remediation just increases alert fatigue."], "correct_index": 1}}, {"id": "edge-0789", "title": "The Watchdog Blind Spot", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why didn't the hardware watchdog trigger a reboot, and how do you design a watchdog system that actually monitors the ML hardware?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 1}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0793", "title": "The 5-Year Edge Device Lifecycle", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the failure modes you must design for, and how do you achieve the availability target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0794", "title": "The Unattended Fleet", "topic": "ota-firmware-updates", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you achieve 'self-healing' and predictive maintenance for both the ML models and the underlying hardware/software stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0797", "title": "The ISO 26262 Neural Network Problem", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", 
"zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you certify a neural network under ISO 26262?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0798", "title": "The Remote Fleet Update Dilemma", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a secure OTA update system for remote safety-critical edge devices that prevents bricking and supports rollback after dropped connections?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0799", "title": "The Supply Chain Attack", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How could an attacker inject a backdoored model through the supply chain, and how do model-specific integrity checks differ from generic binary attestation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0802", "title": "The Adversarial Patch Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you defend against this adversarial patch attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0803", "title": "The Model IP Leak", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you protect the model's intellectual property and ensure its integrity against reverse engineering or malicious modification on the edge device itself?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0804", "title": "The Model Theft from Edge Device", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do they extract your model, and what can you do to prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "Attack vectors escalate: (1) Unencrypted disk: 500 + 10 min. (2) Encrypted, no HSM: 5K JTAG + 1 week. (3) API distillation: 100K queries = $50K total. 
To prevent it, use a full defense stack (secure boot + TEE + HSM + rate-limited API).", "The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "Encrypt the model file on disk."], "correct_index": 1}}, {"id": "edge-0805", "title": "The Autonomous Vehicle Compliance Log", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you log everything without impacting real-time inference or filling the onboard 512 GB NVMe in a day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0806", "title": "The Tamper-Proof Model Fortress", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you secure the ML model from manufacturing to runtime so attackers with physical access cannot modify, replace, or exfiltrate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0807", "title": "The Physical Adversarial Gauntlet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design the robot's perception and decision-making system to be robust against such 'physical world' adversarial attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Edge-side health checks should focus on hardware metrics (CPU, memory); model-level metrics like confidence scores are too noisy to be useful.", "Just train with more adversarial examples. While data augmentation helps, physical attacks often exploit subtle sensor-level vulnerabilities or cross-modal discrepancies that simple data augmentation won't cover.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "A multi-layered defense strategy is required:\n 1. Multi-Modal Redundancy & Fusion: Don't rely solely on one sensor type. 
An attack targeting a camera (e.g., sticker on a sign) might not affect Lidar or Radar."], "correct_index": 3}}, {"id": "edge-0808", "title": "The Model Fortress", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Assuming a determined attacker with physical access, how do you protect the model's intellectual property on the device?\n", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0810", "title": "Edge-Cloud Federated Learning System", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a HIPAA-compliant federated learning system for the 500 monitors, including protocol, privacy, convergence, and device budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0811", "title": "The Multicast Model Update Storm", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bandwidth and delivery time do unicast and multicast require for the 150 MB update to 200 Orin nodes, and why does unicast saturate the uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0812", "title": "The DLA-GPU Pipeline Overlap", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the maximum pipeline throughput of this heterogeneous setup compared to sequential execution, and what is the new bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0813", "title": "The CUDA Stream Contention Trap", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why do the two CUDA-stream models on the Orin serialize, and what fix would achieve at least partial overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0815", "title": "The Fleet OTA Bandwidth Budget", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the full-update cost and time, the delta-update savings, and the total cost impact if 3% of devices have corrupted base models and require full updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0816", "title": "The Edge Inference Offload Decision", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "What are the local versus offloaded end-to-end latencies, and below what bandwidth does offloading become slower than 15 ms local inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0817", "title": "The Multi-Model Memory Tetris", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you schedule the 5 models within the 32 GB and 50 ms limits, and what changes when adding a 6th 4 GB, 20 ms model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all 5 models sequentially: 40+15+8+25+12 = 100ms. This exceeds the deadline, so models must be pruned.", "Run all 5 models concurrently: max(40,15,8,25,12) = 40ms. This meets the deadline and fits comfortably within the 32 GB memory limit.", "Schedule in two phases to respect memory limits. Phase 1: detection, path, gesture (15ms). Phase 2: speech, SLAM. Total is 55ms, so SLAM must be optimized to 25ms (e.g., INT8) to achieve a 40ms total. A 6th model requires re-balancing.", "Only budget model weights (6.9 GB) and run everything concurrently, ignoring activation memory and workspace overhead."], "correct_index": 2}}, {"id": "edge-0818", "title": "The RTSP Stream Bandwidth Saturation", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Calculate total ingest bandwidth, decode throughput required, and determine the bottleneck — is it network, decode, or inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0819", "title": "Evaluating Robust Aggregation in Corrupted Federated Networks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which robust aggregator—Krum, coordinate-wise median, or trimmed mean—should replace FedAvg for 10,000 clients with 2% corrupted gradients?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 2}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0820", "title": "Diagnosing Model Poisoning in FedAvg", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FedAvg fail when 50 of 10,000 devices send L2-25 updates, and how should aggregation be modified?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 1}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0821", "title": "Mitigating Byzantine Poisoning in Federated Learning", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the FedAvg global gradient norm under attack, and what clipping threshold C limits attackers to 10% of the honest magnitude?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 0}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0822", "title": "Memory Bounds of EWC Mitigations", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory do the EWC terms add, does the 2M-parameter FP16 model fit in 16 MB SRAM, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0823", "title": "Diagnosing On-Device Catastrophic Forgetting", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Diagnose the root cause of the base class forgetting and propose a mitigation strategy that fits within a strict 10MB memory constraint.", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 0}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0824", "title": "Mitigating Catastrophic Forgetting on Edge", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which mitigation—EWC, a 50-image replay buffer, or LoRA/classifier head—best prevents forgetting on the 4 TOPS, 512 MB smart camera?", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 1}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0825", "title": "Memory Overhead of EWC on Edge", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total parameter-state memory footprint with EWC and SGD momentum, does it fit in 8 MB SRAM, and how can you make it fit?", "chain_ids": ["edge-chain-auto-025-01"], "chain_positions": {"edge-chain-auto-025-01": 0}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0826", "title": "Debugging EWC Memory Overheads on MCU", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling EWC make fine-tuning the 25K-parameter wake-word model OOM on 256 KB SRAM, and how would you fix it?", "chain_ids": ["edge-chain-auto-025-01"], 
"chain_positions": {"edge-chain-auto-025-01": 1}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0827", "title": "On-Device Continual Learning: EWC vs Replay Buffers", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use EWC or the 10MB replay buffer for this 50M-parameter assistant, and what is the on-device training memory cost?", "chain_ids": ["edge-chain-auto-025-01"], "chain_positions": {"edge-chain-auto-025-01": 2}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0828", "title": "Edge Experience Replay Buffer Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many historical frames fit in the 64MB buffer as uncompressed FP32 versus heavily compressed 25KB JPEGs, and which format is structurally superior for this task?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 0}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0829", "title": "Diagnosing Forgetting in Edge Anomaly Detection", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the day-one bearing anomaly to be forgotten, and what flash-backed replay strategy would you use within the 1GB limit?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 1}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Relying purely on regularization techniques like Elastic Weight Consolidation (EWC) which introduces significant compute overhead on the edge CPU, or simply freezing the lower layers, which limits adaptation to new anomalies.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count.", "Implement a reservoir-sampled Experience Replay buffer. Allocate 500MB of the available flash storage to maintain a diverse set of historical edge cases.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets."], "correct_index": 2}}, {"id": "edge-0830", "title": "Latent vs. 
Raw Experience Replay Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 1MB Flash replay buffer store 2KB raw sensor windows or 256-byte latent embeddings, and what are the quantitative trade-offs?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 2}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0831", "title": "Debugging SecAgg Stragglers and Metric Bias", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 85% of SecAgg rounds failing, and why do successful rounds report 12ms latency instead of the 35ms fleet average?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0832", "title": "Designing Privacy-Preserving Fleet-Wide Analytics", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you estimate the global package-detection FPR under LDP while keeping each doorbell under 5KB/day of uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0833", "title": "Federated Analytics SecAgg Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-camera upload and total server ingress for one SecAgg round with 1,000-bin histograms and k=100 neighbors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0834", "title": "Debugging OOM in Edge Device Finetuning", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the OOM during 4096-token LoRA fine-tuning on 16GB memory, and what software mitigation would you use?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0835", "title": "On-Device Fine-Tuning Memory Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With layer-wise gradient checkpointing, what is the new activation memory footprint and percentage increase in total training FLOPs?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-18": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0836", "title": "On-Device LLM Fine-Tuning Memory Trade-offs", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you enable gradient checkpointing for the 1.5B LoRA fine-tune on the Orin Nano, or use swap or shorter sequences instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0837", "title": "Federated Gradient Uplink Calculation", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum gradient quantization bit-width lets the 32M-parameter model upload within 40 seconds at 2 Mbps with 2MB overhead?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0838", "title": "Evaluating Extreme Gradient Quantization", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is 4-bit gradient quantization sufficient in the 5-minute, 1 Mbps upload window, or is 1-bit with error feedback required?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0839", "title": "Diagnosing Federated Quantization Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this failure, and what specific characteristics of the quantized gradients are causing the stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0840", "title": "Diagnosing On-Device LoRA Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does LoRA fine-tuning the 3B model OOM by hitting the 6GB limit, and how would you eliminate the hidden memory bloat?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming that Parameter-Efficient Fine-Tuning (PEFT) inherently guarantees Memory-Efficient Fine-Tuning.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Diagnose that the root cause is activation memory. 
Even with frozen base weights, backpropagating gradients to the LoRA adapters requires storing the intermediate forward activations for every layer where an adapter is present.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 2}}, {"id": "edge-0841", "title": "LoRA Memory Footprint Calculation", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory is needed for the FP16 7B base, LoRA adapters, and FP32 Adam states for all 3 tasks when adapting W_q and W_v with r=8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0842", "title": "On-Device LoRA Rank Selection Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 7B INT8 LLM on a 32GB Orin, would you choose LoRA rank r=8 or r=64 for W_q and W_v, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0843", "title": "Mitigating Client Drift in Federated Recommendation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you stabilize FedAvg training for the 15MB recommender under extreme non-IID client drift without worsening cellular dropout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0844", "title": "Diagnosing Federated Client Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the production FedAvg loss to diverge after dialect-skewed rounds 15-20, and how would you mitigate the gradient skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0845", "title": "Non-IID Variance Skew in FedAvg", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the variance-contribution ratio of athlete Group B to typical Group A in the FedAvg aggregated gradient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0846", "title": "SRAM Budget for Microcontroller Fine-Tuning", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much SRAM is required for the INT8 base, FP32 layer, gradients, Adam state, and workspace, and does it fit in the 256KB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "edge-0847", "title": "Diagnosing OOM in Microcontroller On-Device Fine-Tuning", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the memory exhaustion, and how should the training pipeline be re-architected to fit safely within the SRAM constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0848", "title": "Evaluating On-Device Adaptation Strategies for Mobile Keyboards", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 100M-parameter keyboard model under a 256MB, 10-minute nightly budget, should you use full fine-tuning, LoRA, or a local n-gram cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0849", "title": "Diagnosing Thermal Degradation in On-Device FL Scheduling", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the FL training jobs cause overheating, battery drain, and 40% worse p99 app launch latency despite only running when idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0850", "title": "Evaluating Opportunistic On-Device Training Triggers", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is running a 20-minute, 5W training epoch whenever the screen is off for 5 minutes feasible, and what scheduling conditions would you require?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0851", "title": "Smartwatch Gesture Personalization", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the smartwatch gesture model be fully fine-tuned or adapted with a frozen backbone and 50K-parameter head under the 10MB training budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "Assuming the primary bottleneck for on-device training is compute (FLOPs) rather than the memory required to store intermediate activations during the forward pass for backpropagation.", "Implement a personalization layer architecture. By freezing the MobileNet backbone, it runs in standard inference mode, meaning intermediate activations are discarded immediately. 
Only the final layer's input activations and gradients are cached.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance."], "correct_index": 2}}, {"id": "edge-0852", "title": "On-Device Personalization Memory Footprint", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What exact byte footprint is needed for the MLP head's weights, gradients, Adam states, and batch activations with batch size 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0853", "title": "Debugging Edge Shadow Mode OOM Crashes", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the wake-word devices hard crash after enabling shadow-mode evaluation alongside the baseline under a 1.5MB SRAM limit?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 0}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0854", "title": "On-Device Shadow Evaluation Trade-offs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can the AR glasses continuously run both gaze-tracking models on every frame, and what shadow-mode architecture meets 50 FPS and 8MB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0855", "title": "Shadow Evaluation Latency Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum shadow-model evaluation frequency (1 out of every N samples) without dropping 100Hz sensor packets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0856", "title": "Top-K Gradient Sparsity for Constrained Edge Uploads", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum gradient sparsity is required for the 10M-parameter FP32 update to fit the 10-second, 960 kbps upload window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0857", "title": "Diagnosing OOMs in Sparse Federated Learning", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did reducing the network payload cause the edge devices to run out of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-0858", "title": "Evaluating Sparse Gradient Updates on Edge", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you use Top-k sparse gradients to reduce the 40MB dense FL upload over a 1 Mbps link without hurting convergence?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0859", "title": "SRAM Capacity and Activation Tiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much SRAM is needed for the 256x256x32 input, 256x256x64 output, and 3x3 weights, and is tiling required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0860", "title": "SRAM Sizing vs Off-Chip DRAM Power", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you choose 4MB on-chip SRAM or stream 2MB of weights from LPDDR4 each frame, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0861", "title": "Diagnosing SRAM Thrashing in Edge Accelerators", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this memory anomaly and its associated power spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The power budget should be managed at the application level by adding sleep intervals between inferences.", "Running inference at lower clock speeds increases total energy consumption because the task takes longer.", "SRAM capacity is exceeded by the combined 2.6MB footprint of weights and activations, forcing continuous eviction to external DRAM.", "The external LPDDR4 DRAM is malfunctioning and sending corrupted weights to the SRAM, causing cache misses."], "correct_index": 2}}, {"id": "edge-0862", "title": "Mitigating Catastrophic Forgetting from Extreme Staleness", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does applying the ship's 45-day stale update hurt fleet accuracy, and what reconciliation protocol should replace direct averaging?", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 2}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0863", "title": "Staleness-Aware Reconciliation in Offline Drones", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is 
Drone Alpha's discounted aggregation weight after 15 stale rounds, and what total INT8 uplink plus downlink payload is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weight 0.05, Payload 10MB", "Weight 0.0125, Payload 10MB", "Weight 0.25, Payload 20MB", "Weight 0.0125, Payload 20MB"], "correct_index": 3}}, {"id": "edge-0864", "title": "Maritime LLM LoRA Reconciliation over VSAT", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the ship reconcile its 45-day v1.0 LoRA adaptation with the global v2.0 base over a 2 Mbps link without uploading 500GB of telemetry?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0865", "title": "Depthwise Separable Convolution FLOP Savings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For an input with 256 channels and 256 output channels at spatial resolution 56x56, what FLOP reduction factor does MobileNet's depthwise separable convolution provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x — depthwise separable only halves the convolution cost.", "~4x — it removes spatial computation but keeps channel computation.", "~8-9x — it factorizes a k² * C operation into k² + C operations.", "~64x — each channel is processed independently with no cross-channel interaction."], "correct_index": 2}}, {"id": "edge-0866", "title": "CNN Translation Equivariance as Hardware Efficiency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "From a systems perspective, why does a CNN's translation equivariance and local connectivity translate into lower parameter count and FLOPs than a similarly accurate ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0867", "title": "The Low-Rank Decomposition Latency Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 8x low-rank FLOP reduction for the 4096x4096 matrix yield only a 15% latency drop on the Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Three factors explain the gap. First, while the parameter reduction is 8× (16.8M → 2.1M), the FLOP reduction is also 8× for matrix-vector products. The original matrix-vector multiply costs 2 × 4096² = 33.6 MFLOPs.", "Graph-level optimizations are always superior to kernel-level optimizations because they capture global data flow patterns.", "Equating parameter reduction with proportional speedup. 'We have 8x fewer parameters, so we should see ~8x speedup.' 
This confuses model size with compute time and ignores hardware execution realities.", "The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck."], "correct_index": 0}}, {"id": "edge-0868", "title": "The Real-Time Compression Stack Design", "topic": "extreme-quantization", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What ordered compression pipeline, target ratios, and accuracy tradeoffs would meet the 50ms/token budget on the Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0869", "title": "Latency Spikes from NPU Graph Shattering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If all 40 convolutions are delegated but latency is 85ms with 100% CPU, what is the likely root cause and fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0870", "title": "OOM from Implicit Runtime Layout Transposes", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 3.5GB attention memory spike despite 120MB theoretical activations, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0871", "title": "NPU Fallback via Dynamic Shapes", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does latency jump from 12ms at 224x224 to 180ms with 100% CPU at 256x256 inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0872", "title": "SRAM Spills in Aggressive Fusion", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did fusing Depthwise Conv + 1x1 Conv on the edge NPU increase latency from 5ms to 18ms and triple DRAM use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0873", "title": "JIT Tracing Memory Leak", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does memory grow by 2MB per inference for variable-length text on the edge CPU, eventually OOMing at 4GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0874", "title": "Emulation Overhead in Fused DSP Ops", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "Why did fusing Conv2D + Swish onto the DSP increase latency from 20ms to 45ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0875", "title": "Constant Folding Binary Bloat", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did constant folding the static Embedding -> Dense block expand the MCU binary from 1.2MB to 3.2MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0876", "title": "Dynamic Control Flow Graph Breakages", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do complex scenes with more than 50 detected objects make the JIT-traced detector spike from 15ms to 85ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0877", "title": "AR Glasses Power Bottleneck", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you quantize the 1.2B ViT to fit 1.5GB and under 1W while avoiding the low-light INT4 PTQ accuracy collapse?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0878", "title": "Smart Home KV Cache Crisis", "topic": "extreme-quantization", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the KV cache so three 4000-token conversations fit in the 2.5GB RAM budget without breaking attention quality?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system should use standard INT8 quantization across all tokens.", "The system requires an asymmetrical, group-wise KV cache quantization architecture.", "The system should offload the KV cache to the hub's NVMe SSD storage.", "Quantizing the entire KV cache to INT4 linearly across all tokens."]}}, {"id": "edge-0879", "title": "Drone Calibration Domain Shift", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the urban-calibrated INT8 depth model fail in foggy rural areas, and how would you fix it within the 20ms NPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0880", "title": "IoT Vibration Dynamic Range", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you preserve 0.01g turbine anomalies on a symmetric INT8 Cortex-M7 NPU with signals ranging up to 100g?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 4}, "chain_tiers": 
{"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using standard min-max calibration over the whole dataset, which maps the 100g peaks to 127 and completely crushes the critical 0.01g micro-fracture signatures into the 0 bin.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Standard linear quantization fails here because the dynamic range (10^4) exceeds the representational capacity of linear INT8. To fix this without upgrading the hardware, the system architecture must incorporate non-linear signal compression before NPU execution.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 2}}, {"id": "edge-0881", "title": "Paged Attention Fragmentation Stalls", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does PagedAttention OOM on 8 short 50-token queries with 512-token pages, and what page size would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0882", "title": "Speculative Decoding Acceptance Drop", "topic": "speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding with K=4 drop from 25 tok/s in chat to 4 tok/s for structured JSON, and how should the runtime react?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "The model should be retrained with fewer parameters to reduce inference time, as model size is the primary driver of latency.", "The root cause is a catastrophic drop in the draft model's token acceptance rate for out-of-domain or highly structured text. 
Speculative decoding only yields a speedup if the time saved by accepted drafted tokens outweighs the overhead of running the draft model and the verification step.", "Assuming the JSON output is hitting max-token limits or that the target model lacks JSON training data."], "correct_index": 2}}, {"id": "edge-0883", "title": "The Chunked Prefill Starvation", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 6000-token summarization request freeze other decodes for 850ms, and how would you prevent that jitter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0884", "title": "KV Cache Quantization Cast Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT8 KV-cache quantization halve memory but raise decode latency to 65ms/token on the mobile NPU, and what would you change?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 3}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The root cause is the lack of native mixed-precision support for Attention on the NPU, leading to massive dequantization overhead.", "Operator fusion primarily reduces compute time by eliminating redundant arithmetic operations between layers.", "Assuming the INT8 memory is simply slower to read than FP16 memory due to alignment issues.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 0}}, {"id": "edge-0885", "title": "Unpadded Batching Inefficiency", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 50-token prompts take 600ms when batched with one 3000-token prompt, and what batching or attention change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0886", "title": "Speculative Decoding KV Cache Leak", "topic": "speculative-decoding", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding OOM after 10 minutes while standard autoregressive decoding runs the full lecture, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it.", "Assuming the draft model's weights are slowly leaking memory over time due to a PyTorch memory management bug.", "Automatic restart on failure is dangerous for safety-critical systems; manual intervention should always be required.", "The root cause is a logical memory leak in how the PagedAttention block table 
handles rejected speculative tokens. During speculative decoding, the draft model generates K=4 tokens, and their KV caches are allocated in the memory pool so the target model can verify them."], "correct_index": 3}}, {"id": "edge-0887", "title": "Asymmetric Offload PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is splitting prefill onto the NPU and decode onto the CPU 16.5s, while pure CPU takes only 2.0s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming the NPU is simply too slow at prefill compared to the CPU's vector extensions.", "The root cause is the massive communication overhead of migrating the KV cache between discrete memory spaces. When prefill runs on the NPU, it generates the KV cache for the 2048-token prompt in the NPU's local memory.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "Batch size should be increased to improve throughput, which will proportionally reduce per-frame latency."], "correct_index": 1}}, {"id": "edge-0888", "title": "Fleet OTA Update Strategy", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you roll out the 150MB dashcam model over 1 Mbps 3G while minimizing cellular cost and avoiding costly bootloops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0889", "title": "Disconnected Drift Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect ultrasound data drift within 7 days without uploading 5MB DICOM images over 20 Kbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0890", "title": "Shadow Mode on Tight Edge Memory", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the new 1.2GB vision model against the production model on the 2GB Jetson Nanos without causing an OOM crash?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 2}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0891", "title": "Thermal Throttling Runtime Adaptation", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you keep 30 FPS inference reliable when 45°C ambient thermally throttles the NPU from 800MHz to 300MHz?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 4}, "chain_tiers": 
{"edge-chain-bucket-realtime-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0892", "title": "Zero-Connectivity Autonomous Rollback", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you design the reliability layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0893", "title": "Heterogeneous Edge Model Registry", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build the CI/CD pipeline and model registry to update all three hardware generations without exhausting device storage?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 3}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0894", "title": "Skewed Edge A/B Testing", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the HVAC A/B test to measure energy savings despite huge climate and insulation variance across homes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run a standard A/B test but increase the sample size to all 100,000 homes to statistically overpower the variance.", "Use a simple pre/post test on the entire fleet without a control group, assuming historical data is a perfect baseline.", "Running the test longer or randomly assigning more users to smooth out the variance.", "Use Switchback (Crossover) Testing or Stratified Sampling based on edge-computed embeddings. 
In a switchback design, each individual edge device alternates between Model A (Control) and Model B (Treatment) in randomized blocks (e.g., daily or weekly)."], "correct_index": 3}}, {"id": "edge-0895", "title": "Bandwidth-Aware Edge Debugging", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you collect enough raw frames to debug the 10x intruder false-positive spike without saturating 500 KB/s site uplinks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0900", "title": "Adversarial Traffic on Edge Dragonfly", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Dragonfly GNN workload congest one 100Gbps inter-chassis link while other optical links sit idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Operator fusion primarily reduces compute time by eliminating redundant arithmetic operations between layers.", "Rollback capability is unnecessary if the model was validated in the cloud, since cloud accuracy guarantees transfer to edge devices.", "Blaming the GNN's graph partitioning and attempting to manually re-partition the graph.", "Identify the failure of static minimal routing in a Dragonfly topology. Dragonfly relies on high-radix routers and sparse global links."], "correct_index": 3}}, {"id": "edge-0901", "title": "ECMP Hash Polarization in Edge Storage", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do four 100Gbps NVMe-oF pulls cap at 100Gbps with one link saturated and three links idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0902", "title": "NPU Underutilization in Depthwise Convolutions", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do MobileNetV2 depthwise convolutions use only about 6% of the 16x16 systolic array, and how would you redesign around it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0903", "title": "FPGA DSP Slice Mapping for INT4", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did INT4 quantization congest routing and spike power, and how would you avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0904", "title": "Dataflow Bottleneck in High-Res Drone NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", 
"question": "Why does the Weight-Stationary dataflow make the 4K CNN hit only 12 FPS and max out DRAM power, and what dataflow should early layers use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0905", "title": "MCU SIMD Alignment for Audio CNNs", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the INT8 keyword-spotting CNN using scalar CMSIS-NN kernels on the Cortex-M4, and how would you hit the 50ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0906", "title": "SmartNIC NPU Arithmetic Intensity Mismatch", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the SmartNIC NPU sit at 4% utilization on a batch-1 LSTM DPI model, and what model change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0907", "title": "Edge TPU Spatial Tiling Overhead", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is causing the 4 FPS bottleneck, and how would you change tiling and execution to reach 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The severe bottleneck is the redundant PCIe/USB host-to-device data transfer caused by naive spatial tiling. Because convolutions have receptive fields, splitting an image into spatial tiles requires overlapping 'halos' to prevent edge artifacts. In a deep U-Net, this halo grows exponentially with depth.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "Compressing the image data over the USB bus using JPEG, which introduces latency and compression artifacts ruining the super-resolution."], "correct_index": 0}}, {"id": "edge-0908", "title": "Edge GPU LLM KV Cache Thrashing", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing L2 thrashing during decode, and how would you reduce KV-cache bandwidth to improve tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["During auto-regressive decoding, the attention mechanism must read the entire KV cache from DRAM for every single generated token. 
Even though the 1GB KV cache easily fits in the 64GB unified memory, it is vastly larger than the 8MB L2 cache.", "Quantizing the model weights further to INT4, which won't fix the attention phase bottleneck.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count."], "correct_index": 0}}, {"id": "edge-0909", "title": "Multi-Camera NPU PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does latency spike despite 40% NPU utilization, and how would you redesign the video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0910", "title": "Wearable SRAM Weight Pinning", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you modify the 1.2MB keyword-spotting model and memory layout to meet the 5mW smartwatch budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0911", "title": "On-Device LLM KV Cache Spilling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does generation slow at a 2000-token context, and how would you manage the KV cache to avoid OS kills?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 3}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0912", "title": "AR Glasses Memory Fragmentation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you fix the bad_alloc crashes and 3ms tensor allocation latency despite 1.2GB free RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0913", "title": "ADAS SoC L3 Cache Contention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does CPU LiDAR preprocessing raise ViT latency from 15ms to 42ms, and how would you isolate memory contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0914", "title": "Drone Fused Layer Tiling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why are the accelerator MACs only 18% utilized, and what SRAM tiling 
strategy would reduce the 200ms frame time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0915", "title": "Smart Camera Zero-Copy Pipeline", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the ISP-to-NPU pipeline to meet the 33ms frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0916", "title": "MobileNet vs VGG NPU Bottleneck", "topic": "roofline-analysis", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which model should you choose to meet 200+ FPS on the 2 TOPS, 4 GB/s NPU, and why?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 3}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0917", "title": "Drone Power-Precision Tradeoff", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which accelerator should run obstacle avoidance to maximize flight time, given the 12 GOPS INT8 NPU model and 1% accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0918", "title": "LLM KV Cache Bandwidth Ceiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does decoding drop to 22 tokens/s, and what optimization would improve edge LLM decode throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0919", "title": "Operator Fusion for Arithmetic Intensity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "What optimization would let the Transformer encoder meet the 500 QPS SLA on the 2 TOPS, 4 GB/s NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0920", "title": "Quantization Roofline Shift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the FP32 and INT8 inference times, and how does quantization shift the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantization reduces the memory footprint, shifting the workload from memory-bound to compute-bound, dropping inference time to 1.56ms.", "Quantization shifts the workload from compute-bound 
to memory-bound, dropping inference time to 1.56ms.", "The bottleneck remains memory-bandwidth, but the 4x reduction in data transfer drops inference time to 0.98ms.", "The inference time will not improve because the total MOPS and peak GOPS remain unchanged."], "correct_index": 0}}, {"id": "edge-0921", "title": "Hardware-Aware Batching for Edge NLP", "topic": "safety-certification", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Given an NPU with 5 TOPS compute and 5 GB/s memory bandwidth, should the 4 audio streams run sequentially or as Batch=4 to meet the 15ms SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Calculating compute time only (40 GOPS / 5 TOPS = 8ms) and assuming both batching strategies will easily meet the 15ms SLA.", "Increasing batch size will shift the workload from memory-bound to compute-bound, fully utilizing the available TOPS.", "Batching alters Arithmetic Intensity. For Batch=1, the 20MB weights are loaded 4 separate times, making the workload highly memory-bound and exceeding the SLA due to 16.8ms cumulative memory transfer latency.", "The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it."], "correct_index": 2}}, {"id": "edge-0922", "title": "Shared Bus Bottleneck in SoC Pipelines", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Will upgrading the NPU to 2x TOPS stop the 30 FPS frame drops, and what should be optimized instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0923", "title": "SRAM vs DRAM Energy Roofline", "topic": "safety-certification", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the 2MB Transformer model fit the 5mW power budget at 100 Hz, and what dominates the energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0924", "title": "Fleet-wide Rollback under Bandwidth Constraints", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you detect the 2-hour latency creep and safely roll back 50,000 dashcams without saturating cellular uplinks?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 3}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0925", "title": "Ultra-Low Bandwidth Drift Detection", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What observability payload would monitor seasonal data drift within the 10 KB/s satellite uplink 
limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0926", "title": "Debugging Thermal Throttling Cascades", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you observe thermal-throttle jitter on the glasses without adding polling heat or missing the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0927", "title": "Safe A/B Testing on Storage-Constrained Edge", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you A/B test V2 across 500,000 vacuums when V1 and V2 cannot both fit in 16MB flash?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 1}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0928", "title": "Tracing Latency in Hybrid Edge-Cloud Pipelines", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you add distributed tracing to isolate the 1200ms P99 bottleneck with under 2ms edge-path overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0929", "title": "Federated Learning Energy Optimization", "topic": "federated-learning", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which federated learning schedule uses less total energy per device to converge, 1 epoch for 200 rounds or 5 epochs for 80 rounds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0930", "title": "AR Glasses Thermal Management via DVFS", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the headset use race-to-sleep or DVFS at 0.6 GHz and 0.6V to meet the 1.2W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0931", "title": "Smart Camera Carbon Payback Period", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the carbon payback period for adding the 20 kg CO2e edge AI modules to the 10,000 traffic cameras?", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "edge-0932", "title": "Autonomous Vehicle Dynamic Power Capping", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 10% frequency and voltage reduction on the 3 perception nodes absorb the 65W planning spike while maintaining 30 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0933", "title": "Drone Edge Compute vs Flight Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which precision, FP16 or INT8, maximizes total powerline-inspection distance for the 150 Wh drone?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0934", "title": "Retail Edge vs Cloud LLM TCO", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which option has lower 3-year TCO per store for 20,000 tokens/day, cloud API serving ($0.50 per 1M tokens) or a local edge server ($1,500 CapEx, 150W continuous, $0.12/kWh)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0935", "title": "Mobile Speculative Decoding Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the Joules-per-token cost of standard generation versus speculative decoding on the smartphone LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0936", "title": "IoT Edge TPU Sleep State Optimization", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which sleep state, clock-gated or power-gated, minimizes average energy per 200ms anomaly-detection cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0937", "title": "Energy Optimization Strategy Selection", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which saves more energy on a chip with 20 pJ/byte DRAM and 0.5 pJ/FLOP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0938", "title": "Receptive Field vs Model Size Trade-off for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which achieves the best RF within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-0939", "title": "Designing Fallback for an Autonomous System", "topic": "data-efficiency-selection", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is best for a safety-critical system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0940", "title": "Edge New 0001", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the latency and throughput tradeoff between batch size 1 and batch size 8 for autoregressive decoding on the T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0941", "title": "Edge New 0002", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which layer should you choose for the 4K 30 FPS vision model, standard CNN or depthwise, under the roofline tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0942", "title": "Edge New 0003", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the 100KB FP32 weight matrix be streamed from Flash or preloaded into SRAM at a 10Hz sampling rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0943", "title": "Edge New 0004", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How will FPS scale when the video model moves from 1080p to 4K on the T4, and what bottleneck dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0944", "title": "Sequence Length Scaling Bottlenecks in Edge Vision Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does increasing ViT patch sequence length from 256 to 1024 shift the execution bottleneck on Orin?", "chain_ids": ["edge-chain-auto-017-12"], "chain_positions": {"edge-chain-auto-017-12": 1}, "chain_tiers": {"edge-chain-auto-017-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0945", "title": "Edge New 0006", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the 14B LLM use INT8 or INT4 on the 16GB T4, considering memory bandwidth, KV cache, and generation speed?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0946", "title": "Edge New 0007", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is INT8 still faster than FP32 on the Cortex-M4 despite 4 extra unpack cycles per weight?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0947", "title": "Edge New 0008", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What execution tradeoffs should you expect from applying 2:4 sparsity with INT8 on Orin given 200GB/s memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0948", "title": "Dynamic FP16 vs Static INT8 on T4 GPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 1 million requests/day on a T4 edge server, should you deploy dynamic FP16 or calibrated static INT8, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0949", "title": "Edge New 0010", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the 500KB Cortex-M4 model use 50% unstructured pruning or INT8 quantization to fit 256KB SRAM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0950", "title": "Edge New 0011", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 4 synchronized 1080p 30 FPS streams on Orin, should you batch all frames or run 4 concurrent single-batch streams?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 3}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0951", "title": "Edge New 0012", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At a 1kHz sensor rate with 4ms inference, should the Cortex-M4 drop samples for immediate inference or batch 10 samples?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0952", "title": "T4 Edge Dynamic Batching for Strict 50ms SLA", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "How should you choose the dynamic batching timeout and max batch size to meet the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0953", "title": "Edge New 0014", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For the 60 FPS robotics loop, should image preprocessing stay on the ARM CPU or be offloaded to the GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0955", "title": "Edge New 0016", "topic": "duty-cycling", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the IoT node continuously analyze data or duty-cycle to run inference once every 100ms, and what is the energy tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0956", "title": "Edge New 0017", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the Orin AGX robotics app use 15W or 60W mode to meet 30 FPS, and how do their FPS-per-watt efficiencies compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0957", "title": "Edge New 0019", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Does lowering the Orin GPU clock by 20% reduce energy per inference despite a 25% latency increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0958", "title": "Edge New 0021", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 10MB frames at 30 FPS on Orin, should the pipeline use explicit CPU-to-GPU copies or unified zero-copy memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0959", "title": "Edge New 0022", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the accelerator keep intermediate tensors in GDDR6 or offload them to host DDR4 over PCIe Gen4 to save VRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0962", "title": "Graceful Degradation Under Thermal Throttling", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "When Orin throttles to 100 TOPS but the pipeline needs 120 TOPS at 30 FPS, how should it gracefully degrade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0963", "title": "The KV Cache Checkpoint Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What inference latency overhead comes from checkpointing a 1GB LLM KV cache over PCIe Gen4 every 10 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0964", "title": "T4 Maximum Continuous Batch Size for LLM Strict Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What maximum continuous batch size on the T4 maximizes LLM throughput without exceeding the 100ms per-token SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0966", "title": "Edge New 0030", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the T4 run a 500ms LLM prefill monolithically or use chunked prefill while decoding at 20ms/token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0967", "title": "Dynamic KV-Cache Management via PageAttention on Edge T4", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the 16GB T4 implement PageAttention for dynamic KV-cache management, and what tradeoff does it create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0968", "title": "Edge New 0032", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 50GFLOP inference and 5MB image over a 50MB/s 5G link, should the T4 run locally or offload to an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0969", "title": "Edge New 0033", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 10KB accelerometer windows, should the wearable transmit raw BLE data or run local inference and send a 10-byte result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0970", "title": "Edge New 0034", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "For split CNN inference over a fluctuating 10–50MB/s 5G link, should the Orin send the 5MB raw image or a 1MB intermediate tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0971", "title": "Edge New 0035", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 30B transformer across 4 Orins on 1Gbps Ethernet, should you use pipeline parallelism or tensor parallelism, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0972", "title": "Bursty Edge Queue Diagnosis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many frames are queued to cause the 220 ms latency, and what characteristic of the arrival process must be changing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0975", "title": "Edge Swarm Feature Map Aggregation over 5G", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the exact compression ratio required to sustain real-time processing and prevent uplink saturation?", "visual": {"kind": "svg", "path": "edge-0975.svg", "alt": "A fanout diagram showing multiple drones funneling data into a single constrained wireless network node, which then connects to an aggregation server.", "caption": "Swarm Uplink Architecture"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0976", "title": "Asymmetric Split-Computing Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency of a single inference and the maximum sustained pipeline throughput in inferences per second?", "visual": {"kind": "svg", "path": "edge-0976.svg", "alt": "A Gantt chart illustrating the staggered execution of M4 compute, Bluetooth transfer, and NPU compute across multiple pipeline cycles.", "caption": "Split-Computing Pipelined Execution"}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0977", "title": "Edge NPU Queueing Under Bursty Loads", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the utilization of the Hailo-8 NPU and the average queue length using M/D/1 queueing principles?", "visual": {"kind": "svg", "path": "edge-0977.svg", "alt": "A line plot showing the hockey-stick curve of queue length 
increasing exponentially as NPU utilization approaches 1.0.", "caption": "M/D/1 Queue Length vs Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0979", "title": "Autonomous Vehicle Edge Data Pruning Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the required data drop rate (selectivity) the A17 model must achieve to prevent the SSD from overflowing during the 8-hour shift?", "visual": {"kind": "svg", "path": "edge-0979.svg", "alt": "A stepped bar chart showing data volume decreasing massively from 144 TB Raw Sensor Data down to 16 TB Saved Data after passing through the A17 Neural Engine filter.", "caption": "Data Volume Reduction Pipeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0980", "title": "Wake-on-VAD Duty Cycling Battery Life", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average daily energy consumption in milliwatt-hours (mWh) and expected battery life on a 3000 mWh cell, and does it meet a 1-year target?", "visual": {"kind": "svg", "path": "edge-0980.svg", "alt": "A sleep/wake timeline showing power consumption spikes to 5mW every 100ms with a baseline of 0.1mW.", "caption": "VAD duty cycle power profile."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0982", "title": "Edge Intermittent Power Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the optimal checkpoint interval (in seconds) that balances checkpoint overhead against recompute time to maximize useful training progress?", "visual": {"kind": "svg", "path": "edge-0982.svg", "alt": "A graph showing checkpoint overhead decreasing and failure recompute time increasing as the checkpoint interval grows, with a minimum cost at 268 seconds.", "caption": "Optimal checkpoint interval trade-off."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0985", "title": "Multi-Camera Ingestion on Jetson Edge", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should the data pipeline from camera ingestion to NPU inference be designed to prevent main LPDDR5 memory bandwidth starvation and frame dropping?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0995", "title": "Analyzing Activation Memory Footprint for MobileNetV3 on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What peak activation memory should you expect for
MobileNetV3-Large on Orin at batch 1 and 224x224 input?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0996", "title": "Designing a Memory-Constrained Training Loop for On-Device Fine-Tuning on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the LoRA fine-tuning loop on Orin to fit the 3B VLM within 16GB total memory while achieving at least 5 training steps per minute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0997", "title": "Evaluating Activation Memory Impact of Batch Size Scaling on Jetson Orin Inference", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is YOLOv8-L memory scaling from batch 1 to 8 linear, and what likely causes the OOM at batch 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0998", "title": "Evaluating Hailo-8 vs. Jetson Orin Activation Memory Architecture for Embedded CV", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which platform, Hailo-8 or Jetson Orin, better handles EfficientDet-D2 activation memory at 512x512, and what constraints matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0999", "title": "Fluency: Explaining Activation Memory vs. Weight Memory Trade-off for Edge Practitioners", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why can a MobileNetV3 with only 5MB of weights consume over 200MB during inference on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1000", "title": "Fluency: Identifying When Gradient Checkpointing Helps vs. 
Hurts on Constrained Edge Hardware", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why might gradient checkpointing on Orin increase ResNet training time by 80% instead of the expected 33%?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1001", "title": "Implementing Streaming Activation Processing for Video on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How can you stream a 32-frame SlowFast-R50 model so activations fit within 6GB without accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1002", "title": "Mastery: Proving the Memory-Compute Optimality of Gradient Checkpointing Under Edge Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What checkpoint interval k minimizes training step time for the 50-layer ResNet under a 500MB activation budget, and why is it optimal?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 4}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1003", "title": "Mastery: Activation Memory Analysis for Recurrent Models on Ultra-Low-Power Edge Devices", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Does BPTT for Mamba-130M over T=100 fit in 6 MB SRAM, and what should you use if it does not?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1004", "title": "Optimizing Peak Activation Memory via Layer Fusion on Jetson Orin TensorRT", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What remaining TensorRT ResNet-50 activation buffers cause the 1.8GB peak at batch 32, and how can you reduce it below 1GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1005", "title": "Realizing Activation Memory Reduction Through Quantization-Aware Fine-Tuning on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does QAT for MobileNetV2 use more activation memory than fp32 training on the edge device, and how can you fix it?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1006", "title": "Realizing Tiled Activation Processing for CNNs on Hailo-8 SRAM Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How does Hailo-8 handle the ResNet-34 7x7 stem activation that exceeds SRAM, and what latency tradeoff results?", "chain_ids": ["edge-chain-auto-secondary-003-19"], "chain_positions": {"edge-chain-auto-secondary-003-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1007", "title": "Recall: Defining Activation Memory and Its Relationship to Sequence Length", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What exactly is activation memory, and why does it scale with sequence length for Transformers but not for CNNs?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1008", "title": "Specifying Activation Memory Constraints for a Multi-Model Inference Pipeline on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What peak activation memory bounds should you specify so YOLOv8-L, SegFormer-B2, and BEVFusion fit concurrently in 32GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1009", "title": "Recall MobileNet depthwise separable FLOP savings on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why are MobileNet architectures preferred over ResNets for edge vision on Jetson Orin, and how do depthwise separable convolutions help?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1010", "title": "Fluency: compare Hailo-8 vs Jetson Orin for MobileNet inference efficiency", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the true energy efficiency of this inference, and why is it a mistake to report it as 0.088 TOPS/W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1011", "title": "Fluency: explain MobileNet width multiplier for resource-constrained edge 
deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What does a 0.5 MobileNetV3 width multiplier mean, how does it affect FLOPs and accuracy, and which value should Hailo-8 use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1012", "title": "Implement MobileNetV3 INT8 quantization for Hailo-8 dataflow compilation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you compile MobileNetV3-Large for Hailo-8 and handle INT8 quantization of its H-swish activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1013", "title": "Optimize EfficientNet-Lite for Jetson Orin with TensorRT and INT8 calibration", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the actual GPU utilization for the 21 GFLOPs workload on a 69 TFLOPS Orin profile, and why is 30% wrong?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1014", "title": "Optimize MobileNetV2 latency on Jetson Orin via layer pruning and depthwise op scheduling", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you optimize the depthwise convolutions specifically for the Jetson Orin's Ampere architecture to reduce this latency?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1015", "title": "Realization: deploy EfficientNet-Lite2 on Jetson Orin for multi-stream video inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you build the GPU preprocessing and batched TensorRT pipeline for 8 1080p streams at 15 FPS on one Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1016", "title": "Realization: deploy MobileNetV3 on Hailo-8 with model partitioning for multi-task inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you compile the two-head MobileNetV3 model on Hailo-8 so the 0.18 GFLOP backbone runs only once?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1017", "title": "Mastery: efficient CNN architecture selection for power-constrained 
edge deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which of the three MobileNet-family models would you choose for the 5 FPS solar Hailo-8 wildlife camera, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1018", "title": "Specification: define CNN model requirements for Hailo-8 production deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What complete model specification would you write for a 20-class, 30 FPS MobileNet-family classifier on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1019", "title": "Analyzing RAG Feasibility on Jetson Orin for On-Device Retrieval", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is on-device RAG over 50,000 manuals feasible on Jetson Orin under a 3-second response budget, and what bottleneck dominates?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1020", "title": "Designing a Cascaded Model Pipeline on Jetson Orin + Coral TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you partition the cascaded classifier pipeline between the Jetson Orin and Coral Edge TPU to stay within a 40W total budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1021", "title": "Explaining Agentic Tool-Use Constraints on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the 8-tool-call agent slow from 4 seconds in the cloud to 47 seconds on the edge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1022", "title": "Implementing Offline-First RAG with Incremental Index Updates on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you update the 10,000-document FAISS index incrementally so new documents are searchable within 30 seconds and offline operation continues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1023", "title": "Implementing Model Routing Between Coral TPU and Jetson Orin Based on Input 
Complexity", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a gatekeeper routing layer for simple commands versus complex LLM queries to maintain low latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1024", "title": "Mastering Context Window Management for Long-Running Edge Agents", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Design a context compression strategy that preserves critical information while keeping prefill under 500ms.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1025", "title": "Mastering Power-Aware Pipeline Scheduling on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule the three Jetson Orin pipeline stages to keep average power under 35W over any 10-second window?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1026", "title": "Optimizing FAISS Index Type Selection for Jetson Orin Memory Constraints", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which FAISS index should you use for 500,000 768-dim embeddings on Orin, and what IVF cluster count and nprobe would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1027", "title": "Realizing a Two-Stage Compound Pipeline on Coral TPU + Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using an M/D/1 queue at ρ=0.63, what is the average Orin queue length for flagged Coral TPU frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1028", "title": "Recalling Key Constraints of Coral Edge TPU for Compound Pipelines", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the fundamental hardware constraints of the Coral Edge TPU that determine whether this deployment is feasible?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1029", "title": "Recalling Jetson Orin Memory Architecture for Multi-Model Compound Systems", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", 
"bloom_level": "remember", "status": "published", "phase": "both", "question": "Does LPDDR5 unified memory on Jetson Orin mean the CPU, GPU, and DLA all share the same physical DRAM, and what are the bandwidth implications for concurrent multi-model execution?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1030", "title": "Theoretical Throughput Calculation for Edge AI Models", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the theoretical FPS for a 219M-MAC INT8 model on a 275 TOPS Jetson Orin, and why is 627 FPS off by 1000x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1031", "title": "Dataset Curation: Design On-Device Data Collection Pipeline for Edge Models", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the on-device quality filtering and upload pipeline for 10,000 Orin devices capturing 100 images per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1032", "title": "Dataset Curation: Evaluate Centralized vs Federated Data Curation for Edge Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do centralized upload, federated updates, and a hybrid approach compare for the 5,000-device warehouse data pipeline in terms of data quality, bandwidth, privacy, and model improvement rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1033", "title": "Dataset Curation: Evaluate Active Learning Strategies for Edge-Deployed Models", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare entropy, margin, and random image selection for active learning on the 200 inference/sec Jetson Orin detector?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1034", "title": "Dataset Curation: Implement On-Device Data Augmentation Budget for Jetson Orin", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you implement augmentation for 64-image batches and verify it stays under the 10 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1035", "title": "Federated Learning Data Strategy for Industrial Edge Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What federated learning data strategy would you use for 50,000 non-IID Jetson Orin devices to reach 95% defect precision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1036", "title": "Dataset Curation: Mastery — Edge-Aware Data Curation for Continual Learning", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the 8GB replay buffer and update schedule to adapt to new product classes without catastrophic forgetting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1037", "title": "Dataset Curation: Optimize Label Efficiency for Edge Model Training", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you choose 10,000 of the 100,000 unlabeled images to maximize mAP gain under the $5,000 labeling budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1038", "title": "Dataset Curation: Optimize On-Device Dataset Compression for Storage-Constrained Edge", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you store 50,000 training examples within the 4GB budget without significant quality loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1039", "title": "Dataset Curation: Realize Training Data Pipeline for Edge Model Refresh", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the monthly model refresh pipeline from edge collection to cloud fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1040", "title": "Dataset Curation: Specify Data Quality Requirements for Safety-Critical Edge Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative data quality SLAs would you set for the pedestrian detector to achieve a < 0.1% miss rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1041", "title": "Fault Tolerance: Design Checkpoint Strategy for Jetson Orin Continuous Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", 
"zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design fault tolerance for model state and continuity across power loss, storage corruption, and model degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1042", "title": "Fault Tolerance: Evaluate Warm vs Cold Recovery for Edge Device Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the failed Orin device, how do cold cloud recovery and warm NVMe snapshot recovery compare in latency and state freshness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1043", "title": "Fault Tolerance: Evaluate Redundancy Strategies for Mission-Critical Edge Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which redundancy architecture meets the 99.99% uptime requirement for real-time inspection, and how do they compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1044", "title": "Fault Tolerance: Fluency — MTBF Calculation for Jetson Orin Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 1,000 outdoor Orin devices with 40,000-hour MTBF, what monthly failure rate and spare inventory are needed for 98% availability?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1045", "title": "Fault Tolerance: Fluency — Checkpoint Size for Edge Online Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What should each YOLOv8-medium LoRA checkpoint include, and what are its size, write time, and 30-day storage use?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 1}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1046", "title": "Fault Tolerance: Implement Heartbeat and Health Monitor for Edge ML Device", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you monitor 2,000 Orin devices to alert within 5 minutes on latency, accuracy, or power degradation, and what bandwidth is required?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 2}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1047", "title": "Fault Tolerance: Implement OTA Model Update with Rollback for Jetson Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the OTA model update protocol with automatic rollback so a bad 200MB update affects less than 1% of the 500-device fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1048", "title": "Fault Tolerance: Mastery — Fault Tolerant Edge Fleet for Autonomous Industrial Vision", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete fault-tolerance architecture would you design for 1,000 online-learning Orin vision systems to achieve 99.95% uptime?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 4}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1049", "title": "Fault Tolerance: Mastery — Recovery Time Analysis for Edge Learning System", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you analyze the three Orin failure recovery paths and improve each to meet a 5-minute RTO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1050", "title": "Fault Tolerance: Optimize Checkpoint Frequency for Battery-Powered Edge Device", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much battery does checkpointing 2MB LoRA weights every 5 minutes consume, and is it worth reducing the frequency?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 2}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1051", "title": "Fault Tolerance: Realize Checkpoint Storage Architecture for 200-Device Jetson Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage tiers would you specify for 30-day LoRA checkpoint history, 1-hour cloud backup, and 5-minute recovery across 200 Orin devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1052", "title": "Fault Tolerance: Realize Fault Detection Latency for Real-Time Edge Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", 
"question": "How would you implement fault detection on the 60 fps system to catch inference failures within 500ms with alerting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1053", "title": "Fault Tolerance: Recall — What is MTBF and Why Does It Matter for Edge ML?", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is MTBF, how should it determine checkpoint frequency for online learning, and what is the optimal checkpoint interval formula?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 0}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1054", "title": "Fault Tolerance: Specification \\u2014 Define Reliability SLA for Safety-Critical Edge Vision", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What reliability specification would you write to meet SIL-2, including metrics, checkpoints, detection, and manual override?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 3}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1055", "title": "Kernel Fusion: Recall — Why is Kernel Fusion Critical for Jetson Orin?", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does kernel fusion matter significantly more on an edge SoC than on an H100, and what is the effective memory bandwidth constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1056", "title": "Kernel Fusion: Design Efficient Inference Kernel for MobileNet on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the activation size of a 112x112x16 depthwise-conv output in float32, and why is 6.4MB wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1057", "title": "Kernel Fusion: Evaluate ONNX Runtime vs TensorRT Fusion on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do ONNX Runtime with CUDA EP and TensorRT compare for ResNet-50 on Jetson Orin in latency, throughput, and power efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1058", "title": "Kernel Fusion: Evaluate Depthwise Convolution Fusion Strategies on 
Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the 56x56x32 MobileNetV2 inverted residual block, which fusion strategy is viable and what memory and latency savings should you expect?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 3}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1059", "title": "Kernel Fusion: Mastery — Optimize Inference Latency for Real-Time Edge Vision", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the theoretical INT8 compute time for 8.7B ops on a 275 TOPS Jetson Orin, and what TOPS/GOPS unit mistake should you avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1060", "title": "Kernel Fusion: Optimize Inference Kernel for Depthwise Separable Convolution", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With DWConv taking 35% of time at 2 OPS/byte versus a 4 OPS/byte ridge point, what is the bottleneck and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1061", "title": "Kernel Fusion: Optimize Int8 Fusion Pipeline for Hailo-8 NPU", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you reduce YOLOv5s latency from 8ms to 5ms on Hailo-8 by exploiting compile-time dataflow fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1062", "title": "Kernel Fusion Impact for EfficientDet on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For EfficientDet-D0 on Jetson Orin, how much latency is saved when TensorRT fuses the ONNX graph from 236 nodes to 42 nodes, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1063", "title": "Kernel Fusion: Realize Fusion Gain Analysis for Transformer on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 6-layer BERT-tiny at batch 4 and sequence 128 on Jetson Orin, how do unfused and fused kernels differ in bandwidth needs and speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1064", "title": "Kernel Fusion: Specification — Define Fusion Requirements for Safety-Critical Edge 
Vision", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What kernel fusion specification would you require for the forklift pedestrian detector to meet 60 fps under 50W with safety validation?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1065", "title": "Latency Decomposition: Compare Jetson Orin vs. Hailo-8 for Object Detection Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the realistic Platform A inference time for a 6.5 GFLOP INT8 YOLOv8-small model, and why is an estimate of 59ms mathematically wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1066", "title": "Latency Decomposition: Compare TFLite vs. TensorRT Inference Latency on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For MobileNetV3-Large on Jetson Orin, how do TFLite GPU delegate and TensorRT FP16 compare in end-to-end inference latency and overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1067", "title": "Latency Decomposition: Compute Prefill Latency for Edge LLM on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical and 60%-utilization prefill latency should you estimate for a 1B LLM processing a 128-token prompt in INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1068", "title": "Latency Decomposition: Full Pipeline Latency Audit for Autonomous Drone on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Assuming the accelerator provides 110 effective TOPS, can the sequential pipeline meet the 30ms budget, what is the bottleneck, and what fix is needed?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1069", "title": "Latency Decomposition: Diagnose and Fix Jetson Orin Inference Latency Spike", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What likely causes 8ms-to-45ms latency spikes on the 2 GOP INT8 safety classifier, and what quantified fix would you apply?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 1}, 
"chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1070", "title": "Latency Decomposition: Size E2E Latency for Smart Camera on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency budget for the 1080p Jetson Orin smart camera pipeline, and can it process every frame at 30fps?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1071", "title": "MLOps Lifecycle: Compare OTA Update Strategies for Fleet of Jetson Orin Edge Devices", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 5000 Jetson Orins, how do 500MB full-model OTA updates compare with 50MB delta updates in bandwidth cost, update time, and fleet-wide failure risk?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 1}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1072", "title": "MLOps Lifecycle: Compare A/B Testing at Edge vs. 
Cloud for Model Updates", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which A/B testing strategy (edge-side deployment or cloud-side streaming) is more viable for 1000 Jetson Orin devices, considering bandwidth and operational costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1073", "title": "MLOps Lifecycle: End-to-End MLOps for Autonomous Vehicle Edge Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the MLOps lifecycle for 10,000 edge units, and quantify daily update bandwidth, per-vehicle storage, and monitoring volume?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 3}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1074", "title": "MLOps Lifecycle: Diagnose Edge Fleet Model Drift and Quantify Retraining Trigger", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What likely caused the customer-count mAP drop from 0.87 to 0.79, and what quantitative monitoring and retraining trigger would catch it earlier?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1075", "title": "MLOps Lifecycle: Size Storage and Bandwidth for Edge Model Registry", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the registry storage, per-device storage, update bandwidth, and LTE transfer time for 2000 Jetson Orins with 150MB models?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 0}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1076", "title": "MLOps Lifecycle: Size Edge Model Registry Storage for 5 Versions Retention", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size cloud storage, per-device eMMC impact, monthly bandwidth, and eMMC endurance for 500 devices with weekly 80MB updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1077", "title": "MLOps Lifecycle: Specify Monitoring and Alerting System for Edge CV Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What metrics, sampling rates, alert thresholds, and data volumes would you use to monitor 1000 Jetson Orin vision devices at remote sites?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1078", "title": "Model Format Conversion: Compare ONNX vs. TensorRT for Jetson Orin Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do ONNX-to-TensorRT INT8 and TorchScript direct inference compare for ResNet-50 on Jetson Orin in latency, complexity, and portability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1079", "title": "Model Format Conversion: Compare TFLite vs. ONNX Runtime for ARM Cortex on Jetson", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For MobileNetV2 on Jetson Orin Cortex-A78AE CPUs, how do TFLite XNNPACK and ONNX Runtime ACL compare for FP16 latency, memory, and op coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1080", "title": "Model Format Conversion: End-to-End ONNX to Hailo-8 Conversion Pipeline Mastery", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you convert the custom PyTorch detector, handle unsupported layers, quantify INT8 accuracy loss, and size the compiled file?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1081", "title": "Model Format Conversion: Diagnose TensorRT Conversion Failure for Custom Attention Ops", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What causes the TensorRT INT8 conversion failure on ScaledDotProductAttention for ViT-Base, how would you fix it, and what speedup should result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1082", "title": "Model Format Conversion: Size TensorRT Engine Storage for Multi-Model Jetson Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the TensorRT engine sizes, runtime memory fit, and NVMe load time for the three Jetson Orin models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1083", "title": "Model Format Conversion: Specify ONNX Conversion Pipeline with Validation for Edge", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What CI/CD stages and pass/fail criteria would you specify for this PyTorch-to-ONNX conversion pipeline?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1084", "title": "Model Size Estimation: Analyze Why INT8 Doesn't Always Halve Edge Inference Time", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT8 quantization of ResNet-50 on Jetson Orin yield only 1.3x speedup instead of 2x, using compute and bandwidth bottleneck analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1085", "title": "Model Size Estimation: Design Memory Layout for Multi-Model Edge Vision System", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you lay out memory so all four Jetson Orin vision models stay resident and switch in under 5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1086", "title": "Model Size Estimation: Compare Memory Footprint: Full Model vs. Quantized on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much memory does YOLOv8-l use on Jetson Orin in FP32, FP16, and INT8 TensorRT, and how many concurrent instances fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1087", "title": "Model Size Estimation: Compare MobileNet vs. 
EfficientNet Memory on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do MobileNetV3-Large and EfficientNet-B3 compare in INT8 TensorRT memory, latency, and accuracy per MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1088", "title": "Model Size Estimation: Fluency — Size Edge Vision Model Memory in 60 Seconds", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much LPDDR5 memory does a YOLOv8-n model (3.2M params, INT8 TensorRT) need for this pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1089", "title": "Model Size Estimation: Master Full Memory Audit for Edge LLM Deployment on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the INT8 KV-cache memory per token for the 1B LLM with 32 layers, 32 KV heads, and head_dim 64?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1090", "title": "Model Size Estimation: Master Memory Tradeoff for Precision vs. 
Context Length on Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the FP16 KV-cache memory per token for a 1B LLM with 32 layers, 32 KV heads, and head_dim 32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1091", "title": "Model Size Estimation: Diagnose Memory Pressure Causing Swapping on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the Jetson Orin with 28GB FP16 weights on 32GB LPDDR5 start swapping and slow down 50%, and what quantified fix eliminates it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1092", "title": "Model Size Estimation: Diagnose GPU Memory OOM on Jetson Orin for Batch Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the OOM when moving from batch 4 to 8, and what maximum batch size should fit on the Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1093", "title": "Model Size Estimation: Realize Memory Layout for CV Pipeline on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What LPDDR5 memory budget should you allocate for this 4K camera, 3-model, H.265 pipeline on the 32GB Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1094", "title": "Model Size Estimation: Realize Minimum Hardware Spec for Multi-Model Hailo Deployment", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much host RAM is needed, do the three models fit in SRAM, and what is the model-switching time via 5GB/s PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1095", "title": "Model Size Estimation: Specify Memory Budget for Edge LLM with Fixed 16GB", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What LLM family, precision, parameter count, KV-cache budget, and Time Per Output Token (TPOT) would you deploy within the 16GB constraints?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1096", "title": "MLOps 
Lifecycle: Optimize OTA Update Pipeline for Hailo-8 Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can the 10,000-device Hailo-8 OTA update be reduced from 100 hours to under 1 hour without increasing CDN cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1097", "title": "Model Format Conversion: Realize TensorRT Engine Storage for Dual-Model Hailo Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HEF storage and SRAM are required, do both Hailo-8 models fit simultaneously, and what switching overhead remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1098", "title": "Edge Queueing Theory Recall: Little's Law at the Network Edge", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using Little's Law, what is the mean latency per frame, and what does it imply for real-time video at 30 FPS?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 0}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1099", "title": "Edge Queueing Theory Analyze: Why Jetson Orin Pipeline Stalls", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using queueing theory, why can the 4-camera pipeline have 800ms latency at only 40% GPU utilization, and what stage is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1100", "title": "Edge Queueing Theory Analyze: Tail Latency on Battery-Constrained Jetson", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the M/M/1 utilization and P99 latency at 60W and 25W, and what happens if throttling drops service below arrivals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1101", "title": "Edge Queueing Theory Design: Multi-Model Pipeline on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using the correct M/D/1 formula, what is the queue wait for the 8 ms detection stage at 30 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1102", "title": "Edge Queueing Theory Design: Burst Traffic Handling with Jetson Buffer", "topic": "queueing-theory", "competency_area": 
"latency", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How large should the burst buffer be, what overflow policy should it use, and what maximum queue latency occurs during the 80 events/s bursts?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 2}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1103", "title": "Edge Queueing Theory Design: Hailo-8 vs Jetson Orin Queueing Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For λ=50 detections/s and a 5 GOPS model, what are throughput, utilization, queue wait, and power efficiency on Hailo-8 versus Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1104", "title": "Edge Queueing Theory Diagnosis: Jetson Orin Pipeline Latency Spike", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What root cause explains the 500 ms obstacle-detection spikes despite 35% GPU utilization, and how much latency can the fix remove?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1105", "title": "Edge Queueing Theory Diagnosis: Hailo-8 Throughput Under Real Traffic", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Hailo-8 retail pipeline deliver only 8 FPS on a 30 FPS camera, and how large is the gap to expected MobileNetV2 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1106", "title": "Edge Queueing Theory Diagnosis: Queue Overflow in Edge Gateway", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using M/M/1/K queueing theory, what explains the exact 25% drop rate, and how should this system overload be resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1107", "title": "Edge Queueing Theory Evaluation: Jetson Orin vs Hailo-8+Pi Latency at Low Load", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For option B at λ=20/s and μ=40/s, what is the correct M/D/1 queue wait term before comparing latency and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1108", "title": "Edge Queueing Theory Evaluation: Coral Edge TPU vs Hailo-8 for TinyML Inference", "topic": "queueing-theory", 
"competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the Coral Edge TPU at λ=1000/s and μ=1333/s, what is the correct M/D/1 queue wait Wq before comparing to Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1109", "title": "Edge Queueing Theory Evaluation: Static vs Dynamic Batching on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 40 req/s, should the Jetson Orin use static batch=8 with a 200 ms wait or dynamic batching with a 20 ms timeout?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 3}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1110", "title": "Edge Queueing Theory Fluency: Little's Law Mental Math for Edge Systems", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the P99 latency, accelerator queue length and mean wait, and maximum service time for the three quick edge queueing estimates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1111", "title": "Edge Queueing Theory Fluency: Back-of-Envelope Edge Queue Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the Jetson Orin handle 4x30 FPS ResNet-50 streams, and what are its capacity, utilization, queue wait, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1112", "title": "Edge Queueing Theory Fluency: M/M/1 Metrics at Critical Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are Wq, W, and P99 for Hailo-8 vehicle detection at 70%, 90%, and 99% utilization, and where is the latency knee?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1113", "title": "Edge Queueing Theory Implement: Service Rate from Latency Spec", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the face-recognition pipeline runs at mu=200/s and rho=0.5, what M/D/1 Wq and total latency verify the P99 < 50ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1114", "title": "Edge Queueing Theory Implement: Little's Law for Edge Buffer Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", 
"zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What buffer size (in frames and MB) is required to satisfy Little's Law given the strict 100ms latency maximum?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 1}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1115", "title": "Edge Queueing Theory Mastery: End-to-End Edge Inference System Design", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For a Hailo-8 design with 2 cameras per chip, what is the per-chip arrival rate and utilization?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 4}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1116", "title": "Edge Queueing Theory Mastery: Queueing Analysis for Robot Fleet Coordinator", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many Jetson Orins are needed for the mixed heavy/light workload, can priority queues meet the 500 ms P99 SLO, and does power fit the 200W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1117", "title": "Edge Queueing Theory Optimization: Reduce Jetson Orin P99 by Model Cascading", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much can the two-stage small/large model cascade reduce P99 latency, and is Stage 2 stable without adding hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1118", "title": "Edge Queueing Theory Optimization: Hailo-8 Throughput via Request Coalescing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "If adjacent frames are coalesced in pairs, what are the new utilization and P99 latency, and does it meet the 200ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1119", "title": "Edge Queueing Theory Optimization: Memory-Aware Scheduling on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the memory-bound decode queue at \\u03bb=8/s and \\u03bc=9.7/s, what is the correct queue wait and why does caching matter?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1120", "title": "Edge Queueing Theory Realization: Concrete Queue Depth for Coral Edge TPU", 
"topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many input requests and bytes must be buffered to meet the 500ms P99 SLO, and can the 512KB Cortex-M7 SRAM hold them?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 2}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1121", "title": "Edge Queueing Theory Realization: End-to-End Latency Budget for Edge Pipeline", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the end-to-end P99 latency modeled as G/D/1, and which stage should be optimized first?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1122", "title": "Edge Queueing Theory Realization: Jetson Orin Memory vs Compute Bottleneck", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Concretely determine whether a Jetson Orin serving EfficientNet-B4 at batch=1 is compute-bound or memory-bandwidth-bound at 100 req/s, and what are the resulting M/D/1 metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1123", "title": "Edge Queueing Theory Specification: P99 SLO for Robotic Arm Control", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Should the robotic arm use Jetson Orin or Hailo-8 for the 100Hz vision task, and what scheduling policy guarantees the 5ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1124", "title": "Edge Queueing Theory Specification: Design Smart Meter Edge Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What buffer size, scheduling policy, latency SLO, and power budget should the Hailo-8 smart-meter anomaly detector use?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 3}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1125", "title": "Edge Queueing Theory Specification: Multi-Tenant Edge Inference SLO", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What priority levels and queueing policy should the shared Jetson Orin use for tenants A, B, and C, and do their P99 SLOs hold?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 3}, "chain_tiers": 
{"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1126", "title": "Edge Systolic Array Analyze: Why Hailo-8 Dataflow Wins for Computer Vision", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Hailo-8 architecture achieve significantly better energy efficiency than the GPU for this workload, and how much DRAM traffic is avoided?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1127", "title": "Weight-Stationary Dataflow for Edge Deployment", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is batch=1 ResNet-50 on Jetson Orin memory-bound, what throughput does Roofline predict, and how does batch=64 change it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1128", "title": "Edge Systolic Array Design: Optimal Tiling for Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should the first YOLOv8-nano convolution be tiled for Jetson Orin DLA SRAM, and what latency should the two DLA cores achieve?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 3}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1129", "title": "Edge Systolic Array Design: Hailo-8 Dataflow for Video Analytics", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What TDM schedule should the accelerator use for the three vision models, and what latency does each model see?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1130", "title": "Edge Systolic Array Design: Systolic Array for Conv Layer on Jetson DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should an output-stationary dataflow run the MobileNetV2 depthwise convolution, and how does it compare with 1x1 pointwise convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1131", "title": "Edge Systolic Array Evaluation: Hailo-8 vs Coral TPU Dataflow Efficiency", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which chip achieves better Model FLOPs Utilization (MFU) deploying EfficientNet-Lite0 at batch size 1, and why?", "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1132", "title": "Edge Systolic Array Evaluation: Weight-Stationary vs Row-Stationary on Jetson", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 32-channel 112x112 3x3 depthwise conv, how do WS and RS dataflows compare in SRAM access and energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1133", "title": "Edge Systolic Array Evaluation: Output Stationary for Attention on Edge", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For QK^T with seq=64 and head_dim=64, is output-stationary or weight-stationary dataflow better, and what are the memory access counts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1134", "title": "Edge Systolic Array Fluency: Arithmetic Intensity Estimation for Edge Models", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Without looking up formulas, estimate the arithmetic intensities for MobileNetV2, the 3x3 conv, and the 1x1 conv on Hailo-8, and state whether they are memory or compute bound.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1135", "title": "Edge Systolic Array Fluency: Roofline Model Mental Math for Jetson", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the three Jetson Orin operations, are they memory or compute bound and what limiting throughput or latency do you estimate?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 1}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1136", "title": "Edge Systolic Array Implement: GOPS to FPS Conversion for Edge Models", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What FPS and energy per frame can YOLOv5s achieve on Hailo-8, Jetson Orin, and Coral, and which bottleneck limits each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1137", "title": "Edge Systolic Array Implement: Depthwise Convolution Throughput Calculation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the MobileNetV3 28x28 DW+PW block on Hailo-8, what are the memory accesses, latency, throughput, and bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1138", "title": "Edge Systolic Array Mastery: Full EfficientNet Roofline on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For EfficientNet-B0 on Orin DLA, which layers are memory or compute bound, what MFU do you expect, and what should run on GPU vs DLA?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 4}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1139", "title": "Edge Systolic Array Mastery: Optimal Dataflow Selection for Edge Deployment", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For ViT-Tiny on Jetson Orin, should attention, FFN, and patch embedding run on GPU or DLA, and what latency and energy meet 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1140", "title": "Edge Systolic Array Optimization: Fix Memory-Bound Conv on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What bottleneck explains the CNN's 11.5% MFU, and how do channel grouping and larger spatial resolution change its intensity and speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1141", "title": "Edge Systolic Array Optimization: DLA vs GPU Load Balancing on Jetson", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can MobileNetV2 move from GPU to DLA while maintaining 2000 FPS, and what power savings and maximum DLA FPS result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1142", "title": "Edge Systolic Array Realization: INT8 Quantization Impact on Hailo-8 Throughput", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the FPS, memory footprint reduction, and latency for INT8 vs the theoretical FP16 case if Hailo-8 supported it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1143", "title": "Edge Systolic Array Realization: Batch Size Effect on Edge Throughput", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For EfficientDet-Lite2 on Orin DLA at batch sizes 1, 4, and 16, what FPS, latency, energy, and bound regime do you get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1144", "title": "Edge Systolic Array Recall: Systolic Array Operation Principle", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the key systolic-array principle, and why does the Hailo-8 deliver significantly better TOPS/W than the ARM Cortex-A78 for INT8 matmul?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1145", "title": "Edge Systolic Array Specification: Design Tile Size for Edge DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tile size should you use for the 128-to-256 3x3 conv on Orin DLA, and do the exact SRAM byte counts fit in 3MB per core?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1146", "title": "Edge TCO Analyze: Total Cost of Deployment at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the 5-year TCO of 500 edge AI units compared to a cloud-based alternative at $0.50/hr?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1147", "title": "Edge TCO Analyze: Hailo-8 vs Jetson Orin Cost Per Inference Lifecycle", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which platform provides a cheaper per-inference cost when including hardware amortization and power consumption over 3 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1148", "title": "Edge TCO Analyze: Power Budget Dominates Edge IoT Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 10,000 Coral Edge TPU units over 5 years, what percentage of TCO comes from hardware, power, connectivity, maintenance, and software?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1149", "title": "Edge TCO Design: Optimize Jetson Orin Deployment for 3-Year ROI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct annual power cost for one 60W Jetson Orin running 24/7 at $0.10/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1150", "title": "Edge TCO Design: Hailo-8 ROI for Smart Agriculture", "topic": 
"tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the break-even point and 5-year ROI for deploying Hailo-8 edge inference on 1000 farms versus cloud inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1151", "title": "Edge TCO Design: Size Fleet Budget for Autonomous Vehicle Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the 5-year total cost of ownership (TCO) for the inference compute hardware across the 10,000-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1152", "title": "Edge TCO Diagnosis: Why Edge Deployment Is Losing Money", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 100-unit smart parking deployment losing money, and what monthly revenue per unit is needed to break even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1153", "title": "Edge TCO Diagnosis: Hailo-8 Deployment Failure Root Cause", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hidden costs explain the Hailo-8 fleet overrun, and what is the true 1-year TCO versus the expected TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1154", "title": "Edge TCO Diagnosis: Connectivity Cost Surprise in IoT Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 4G overage, how much does it cost annually, and how should the 2,000-unit Jetson fleet fix it?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1155", "title": "Edge TCO Evaluation: Coral TPU vs Cloud Inference for Low-Volume Deployments", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the beehive monitor at 1 inference per minute, what is the 3-year TCO for edge versus Lambda, and what is the daily break-even volume?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1156", "title": "Edge TCO Evaluation: 3 Edge Hardware Options for Retail Analytics", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the correct 3-year power costs for the 7.5W Hailo-8+Pi5 and 4W NCS2+Pi4 options at $0.10/kWh?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1157", "title": "Edge TCO Evaluation: Batch vs Real-Time Inference Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the 5-year TCO premium for real-time Jetson Orin inference versus overnight Hailo-8+Pi5 batch inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1158", "title": "Edge TCO Fluency: Quick TCO Estimation for Edge Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for the Jetson Orin and Coral Edge TPU fleets, and at what daily inference volume does the Coral fleet beat the cloud?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1159", "title": "Edge TCO Fluency: Rapid Cost-Per-Inference Estimation", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year total cost and cost-per-inference for these systems, and how do they rank by cost-efficiency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1160", "title": "Edge TCO Fluency: ROI Calculation for Edge vs Cloud", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the corrected 3-year TCO for one edge system after including power and 5% annual maintenance?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1161", "title": "Edge TCO Implement: Calculate Cost Per Inference for Edge Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the checkout device running 5 inferences/s for 12hr/day, what is the 3-year TCO and cost per inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1162", "title": "Edge TCO Implement: Fleet Power Cost Optimization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 1000-unit 
Hailo-8+Pi5 fleet, what are the annual power cost at $0.10/kWh, duty-cycling savings (50% during off-hours), and 5-year NPV at a 5% discount rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1163", "title": "Edge TCO Implement: Hailo-8 vs Cloud Break-Even Analysis", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact daily inference volume break-even point between the Hailo-8+Pi5 and AWS Rekognition over 3 years?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1164", "title": "Edge TCO Mastery: Multi-Tier Edge-Cloud-Hybrid Cost Optimization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct 5-year power cost for 50 Hailo-8 edge devices operating at 7.5W continuously, assuming $0.10/kWh?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 4}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1165", "title": "Edge TCO Mastery: Full Lifecycle Cost Model for Industrial Edge AI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the 5-year TCO, ROI, and payback period for the 200-unit predictive maintenance deployment?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1166", "title": "Edge TCO Mastery: Hardware Replacement vs Cloud Migration Decision", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct 4-year power cost for 300 replacement Orin units at 60W and $0.10/kWh?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1167", "title": "Edge TCO Optimization: Power Management for Always-On Edge Systems", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the 5-year power cost savings from DVFS reducing to 15W at idle, and their NPV at a 5% discount rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1168", "title": "Edge TCO Optimization: Model Compression for Edge Cost
Reduction", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct 5-year power cost for 200 Jetson Orin units at 60W and $0.10/kWh before compression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1169", "title": "Edge TCO Optimization: Optimize Connectivity Costs Dominating TCO", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you cut the 1000-unit fleet's 4G connectivity cost by at least 90%, and what are the 5-year savings and payback?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1170", "title": "Edge TCO Optimization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What power-management plan will cut the 500-store Hailo-8 fleet's annual power cost by 60%, and what savings result?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1171", "title": "Edge TCO Realization: Concrete 1-Year Cost Breakdown for Edge AI System", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 1-year cost breakdown across amortized hardware, power, maintenance, and connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1172", "title": "Edge TCO Realization: Scale-Up vs Scale-Out for Edge Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the Jetson Orin and Hailo-8 options for 500 TOPS INT8, compute 3-year TCO, and compare failure modes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1173", "title": "Edge TCO Realization: Jetson Orin Fleet Power Budget for Solar Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What panel wattage, LiFePO4 battery capacity, and 5-year TCO would you size for the 24/7 remote station versus grid power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1175", "title": "Edge TCO Recall: CapEx vs OpEx Tradeoff for Edge AI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": 
"published", "phase": "both", "question": "For 100 Jetson Orin units, what are first-year CapEx, annual power OpEx, and why might a cash-constrained startup still prefer cloud OpEx?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1176", "title": "Edge TCO Specification: Design $50K Budget Deployment for Smart Building", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What complete edge AI deployment would meet 50 cameras at 30 FPS with P99 <100ms under the $50K 3-year budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1177", "title": "Edge TCO Specification: Design Lifecycle Cost Model for 5-Year Edge Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the annual fleet power cost for 100 edge devices at 60W and $0.10/kWh?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1179", "title": "Edge Transformer Cost Evaluation: Quantized vs Full Transformer on Jetson", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is better for a real-time document triage system requiring P99 < 50ms, considering throughput, memory footprint, and latency at batch=1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1180", "title": "Edge Transformer Cost Evaluation: Distilled vs Full Transformer on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which should run on Hailo-8 for batch=1 sentiment, DistilBERT or BERT-base, and what throughput and cost tradeoffs result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1181", "title": "Edge Transformer Cost Implement: BERT Inference Latency on Jetson", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1182", "title": "Edge Transformer Cost Mastery: Full Edge LLM Deployment Analysis", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L6+", 
"zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can one Jetson Orin handle Phi-3-Mini-4K INT4 for 10 simultaneous users with <2s 200-token responses, and what is its annual cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1183", "title": "Edge Transformer Cost Mastery: Quantization Tradeoff for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which quantization level is Pareto-optimal for LLaMA-3-8B on Jetson Orin under P99 <1s and <3% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1184", "title": "Edge Transformer Cost Optimization: Optimize Edge Transformer via Pruning", "topic": "tco-cost-modeling", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is a $30K project to prune BERT-base from 110M to 55M parameters on 100 Jetson Orins justified, and what gains result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1185", "title": "Edge Transformer Cost Optimization: Knowledge Distillation for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What throughput, memory, accuracy, and annual savings would distilling BERT-large to TinyBERT deliver across 50 Jetson Orins?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1186", "title": "Edge Transformer Cost Realization: Size Phi-3 Deployment on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 2-second P99 SLO met for 100 output tokens using either INT8 or INT4 quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1187", "title": "Edge Transformer Cost Realization: FLOPs per Token for Small Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the exact decode FLOPs per token for Phi-3-Mini, how do they compare to the 2N approximation, and is the Jetson Orin compute- or bandwidth-bound?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1188", "title": "Edge Transformer Cost Specification: Design Edge Transformer SLO", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct KV cache size and max INT4 model size after reserving KV for 
5 users at context 128?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1189", "title": "Achieve Mastery in Edge Queueing System Design Under Intermittent Connectivity", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you model buffering, overflow, and reconnection bursts for Hailo-8 drones disconnected for 20 minutes at 4 req/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1190", "title": "Diagnose Dataflow Mismatch Between Hailo-8 and Attention Pattern", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the correct theoretical latency for 4.9M MACs on a 13 TOPS DLA, and why is the 25ms estimate completely wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1191", "title": "Diagnose Systolic Array Pipeline Stall on Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does rate-4 dilation drop the model's FPS so drastically, and how would you fix it?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 1}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1192", "title": "Realize Systolic Array Output-Stationary Tiling on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What output-stationary tiling strategy fits W[512x256] x X[256x128] on the accelerator, and what is the total SRAM traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1193", "title": "Recall Transformer Inference Latency on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the expected single-image inference latency for the ViT-B/16 model?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 0}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1194", "title": "Analyze Prefill vs Decode Cost Split on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do prefill and decode times split for a 7B INT4 model on Orin NX with a 512-token prompt and 128-token response, and why?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1195", "title": "Design Speculative Decoding for Edge LLM on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What speculation window k should you choose for the 7B target and 1B draft at α=0.80, and what speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1196", "title": "Design Continuous Batching for Edge LLM Serving on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the Hailo-8 continuous batching queue, should P99 be estimated from waiting time Wq or total sojourn time W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1197", "title": "Diagnose KV Cache Eviction Causing Latency Spikes on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does KV cache migration occur at only 8 concurrent requests on Orin, and what cache-management fix removes the 2.8s tail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1198", "title": "Diagnose Decode Latency Regression from KV Cache Format on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Hailo-8 transformer slow from 8ms/token to 22ms/token after step 32, and what KV layout fixes it?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 1}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1199", "title": "Edge Transformer Fluency: Memory-Bandwidth-Bound Decode Calculation", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the KV cache size at context=256 for 32 layers, 32 heads, head_dim=128, and FP16, and why isn't it 1.07GB?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 0}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1200", "title": "Edge Transformer Fluency: Prefill FLOPs on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is the attention cost per layer about 8.59 GFLOPs rather than TFLOPs for this prefill calculation?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1201", "title": "Optimize KV Cache Quantization for Jetson Orin Memory Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which KV quantization lets 5 concurrent 16K-context requests fit in the 4.5GB KV budget, and what memory does it use?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 2}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1204", "title": "FlashAttention for On-Device Long Document Summarization", "topic": "flash-attention", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you make a 3B model with 16K context feasible on an edge SoC, including FlashAttention, KV quantization, and attention pattern?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1206", "title": "Speculative Decoding on Jetson Orin", "topic": "speculative-decoding", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which draft strategy for speculative decoding a 7B model on Jetson Orin reduces latency most under the 102GB/s bandwidth limit?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1207", "title": "N-Gram Draft for Zero-Overhead Edge Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you implement K=8 prompt-lookup drafting for a 4096-token prompt on Orin, and what acceptance rates and memory cost should you expect?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1208", "title": "Edge TPU Operator Fallback Penalty", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU inference take 45ms instead of 5ms, and what is the exact USB I/O penalty for the 5MB tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1209", "title": "Continuous Learning Activation Memory Tradeoff", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", 
"question": "Why does the 7B fine-tune on the edge accelerator OOM despite 7GB weights, and how does gradient checkpointing trade compute for memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1210", "title": "Adversarial Vulnerability from Edge Quantization", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did INT8 quantization make the Jetson Orin model vulnerable to physical adversarial patches, and why was higher precision infeasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1211", "title": "Latency Spikes in Dynamic Batching on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do p99 latency spikes occur at 60 FPS with a 15ms dynamic batching timeout on Jetson Orin despite compute headroom?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1212", "title": "Latency Spikes in Cascaded Edge TPU Pipelines", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Stage 2 trigger a 20ms latency spike on Coral Edge TPU despite the pipeline needing only 0.1 TOPS of compute?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1213", "title": "Analyzing Memory-Bound INT8 Throughput on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does throughput flatline around 82 TOPS instead of reaching the Jetson Orin's 275 TOPS INT8 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1214", "title": "Calibration Data Pruning for Edge TPU Quantization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did using 10,000 random calibration images improve Coral Edge TPU INT8 accuracy versus the full 1M-image dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1215", "title": "Analyzing Hailo-8 Host Memory Streaming Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", 
"bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Hailo-8 starving at 15% utilization with the 1080p pipeline capped at 60 FPS, and where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1216", "title": "INT8 Quantization Impact on Edge Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Given the Coral's INT8 requirement, why did the data validation accuracy degrade after the 4x sensor range increase, and what is the compute utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1217", "title": "Bandwidth Bottlenecks in Edge Data Curation", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the Hailo-8 active learning pipeline miss small distant objects after downsampling 4K video to 720p?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1218", "title": "Drift Detection Bottleneck on Dataflow Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does extracting an intermediate feature map metric cause such a severe latency degradation on a dataflow architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1219", "title": "Hailo-8 Host-Device DMA Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory-management issue causes high CPU use and caps the Hailo-8 pipeline at 166 FPS despite low accelerator utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1220", "title": "Encoder vs Decoder Utilization on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the decoder generation arithmetic intensity on the Jetson Orin, and why does it bottleneck while the encoder does not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1221", "title": "Thermal Throttling in Memory-Bound Attention Layers", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do the self-attention layers hit the 60W TDP and throttle despite doing fewer INT8 MACs than the MLP layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1222", 
"title": "GPTQ 3-bit Latency Degradation on Ampere", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 3-bit GPTQ make the 70B model fit on Jetson Orin but still degrade token-generation performance?", "chain_ids": ["edge-chain-auto-secondary-006-01"], "chain_positions": {"edge-chain-auto-secondary-006-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1223", "title": "INT8 Quantization Bias on Cloud AI 100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does global uniform INT8 quantization on the Cloud AI 100 raise Group B's false positive rate while Group A is unaffected?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1224", "title": "Federated Averaging Bottleneck on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Explain why the nodes are idling and why timeouts occur despite the massive compute capacity of the Jetson Orin.", "chain_ids": ["edge-chain-auto-017-01"], "chain_positions": {"edge-chain-auto-017-01": 0}, "chain_tiers": {"edge-chain-auto-017-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1225", "title": "Analyze INT8 Underutilization on Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Cloud AI 100 achieve only 50 TOPS on this INT8 kernel while its LPDDR4x interface is saturated at 100 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1226", "title": "Thermal Throttling and Task Shedding on Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why should the drone completely shed the auxiliary model instead of proportionally downscaling frequencies for both workloads under the 45W cap?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1227", "title": "Analyzing Fusion Memory Spills on Qualcomm Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": 
"inference", "question": "Why does fusing the three 100-GOPS blocks into one 300-GOPS kernel make inference slower on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1228", "title": "Analyzing Memory Bottlenecks in Unfused Operations on Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do separate LayerNorm, add, and GELU CUDA kernels underutilize the Jetson Orin GPU, and how does fusion fix it?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 0}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1229", "title": "Distillation versus Unstructured Pruning on Dataflow Accelerators", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning not speed up the Hailo-8 model while a 50-GOPS dense distilled model doubles FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1230", "title": "Latency Bottleneck on Qualcomm AI 100", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the batch-8 pipeline take 265ms end to end when accelerator inference is only 15ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1231", "title": "Dataflow Accelerator Round-Robin Degradation", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does dynamic round-robin routing cause this performance degradation on this specific hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1232", "title": "Hailo-8 Host Streaming Bandwidth Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 pipeline achieve only 16 FPS despite 26 TOPS of compute for a 200-GOPS-per-frame model?", "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 2}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1233", "title": "Mmap RAM Tradeoffs for Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"Why does mmap reduce host RAM from 100MB to 20MB for five Coral Edge TPU processes, and how do the INT8 weights still reach the TPU over the USB interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1234", "title": "Jetson Orin OOM During Batch Fine-Tuning", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch size 16 trigger OOM on the 32 GB Jetson Orin while 16-step gradient accumulation with micro-batch 1 fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1235", "title": "Hailo-8 INT8 Streaming Throughput and Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Assuming 8MB of INT8 activations per frame, why does host INT8 pre-quantization double Hailo-8 FPS but reduce accuracy, and what are the FPS limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1236", "title": "CI/CD Latency Skew on Hailo-8 Dataflow Accelerator", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the TOPS-based CI/CD estimate miss the Hailo-8 canary latency spike and host memory saturation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1237", "title": "TensorRT DLA Fallback Overhead on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does TensorRT INT8 partial DLA delegation increase Jetson Orin latency from 20ms to 25ms versus all-FP16 GPU execution?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1238", "title": "Edge Compute vs Thermal Throttling on Orin", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Orin throttle at 400 FPS even though 220 TOPS is below the advertised 275 TOPS INT8 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1239", "title": "Edge TPU Inference Bottleneck Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the USB 2.0 accelerator show low utilization despite a fully INT8 10-million-parameter model 
and high peak compute capability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1240", "title": "Thermal Throttling on Qualcomm Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P99 latency rise by 40% after 45 minutes at 75W in the restricted-airflow edge cabinet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1241", "title": "Hailo-8 Dataflow Topology Memory Bottleneck", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the lower-compute Model X run at 150 FPS while the higher-compute Model Y reaches 600 FPS on the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1242", "title": "Edge TPU Subgraph Partitioning and USB I/O Overhead", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does one unsupported Swish activation add 12ms of latency when the host CPU computes it in only 2ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1243", "title": "Analyzing Power Spikes on Google Coral Edge TPU", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU spike to 2W at 10 FPS, and what are the energy per inference and average power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1244", "title": "Hailo-8 Host-to-Device Streaming Latency Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is Hailo-8 inference latency 6.5 ms with only about 10% compute utilization for a 20-GOPS, 12MB-per-frame model?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1245", "title": "Unstructured Sparsity Latency Stagnation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 75% unstructured pruning reduce model storage but produce zero latency speedup on the Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1246", "title": 
"Asymmetric vs Symmetric Quantization Overhead", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does switching from symmetric to asymmetric INT8 quantization slow dense layers on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 1}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1247", "title": "Hardware-Constrained Safety Guardrail Degradation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU silently stop logging the 30-GOPS guardrail at 60 FPS during peak activity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1248", "title": "Jetson Orin Roofline Bottleneck Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, explain why the accelerator achieves only about 40.8 TOPS for batch-1 ViT inference despite a 275 TOPS peak.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1249", "title": "Unified Memory Bandwidth Starvation", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do four 8K60 streams drop frames on Jetson Orin despite low compute use, and how much LPDDR5 bandwidth do the explicit copies consume?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1250", "title": "Hailo-8 Data Streaming Bottleneck Analysis", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What TOPS does the Hailo-8 achieve on this layer, and why is it underutilized despite a 26 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1251", "title": "Edge TPU Thermal Throttling Analysis", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral USB accelerator drop from 30 FPS to 15 FPS, and what sustained power avoids throttling at 40°C ambient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1252", "title": "Edge TPU Sequence Scaling Bottleneck Analysis", "topic": "transformer-systems-cost", "competency_area": 
"architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does increasing ViT sequence length from 196 to 784 tokens make Coral Edge TPU latency exceed 240ms and trigger host thrashing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1253", "title": "KV-Cache Budgeting on Cloud AI 100", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the batch-32, 2048-token INT8 13B LLM workload exceed the 32 GB memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1254", "title": "Shadow Deployment on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you shadow-deploy the 150 TOPS model without causing thermal throttling or OOMs in the 80 TOPS safety path?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1255", "title": "On-Device Fine-Tuning Checkpoint Architecture for Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a gradient checkpointing strategy to fine-tune the ViT within the 32 GB memory and 60W TDP constraints?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 3}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1256", "title": "Adversarial Defense Architecture on Hailo-8 Dataflow Accelerator", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which real-time adversarial-patch defense would you deploy on the streaming NPU, and how would you fit it into its host-streamed dataflow?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1257", "title": "Dataflow Batching Architecture for Multi-Camera Edge Streams", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you use static or dynamic batching for the four 1080p streams on Hailo-8, and what scheduling policy meets a 15ms p99 SLA?", "validated": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1258", "title": "On-Premise Continuous Learning Data Selection", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design on-device pruning and coreset selection on Cloud AI 100 while avoiding uplink overload and synthetic-data collapse?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1259", "title": "Data Quality Gates for High-Throughput On-Premise Inference", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect on-premise data validation and lineage tracking on Cloud AI 100 without starving the defect detectors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1260", "title": "Active Learning Data Pipeline for Edge TPU Defect Detection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you select 1,000 weekly images for annotation from Coral Edge TPU devices given INT8-only, limited-operator hardware?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1261", "title": "Edge Drift Detection Architecture on Coral TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you detect data drift on Coral Edge TPUs under a 100 KB/s uplink and INT8-only operator constraints?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": {"edge-chain-auto-secondary-009-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1262", "title": "Architecting Zero-Copy DMA for Multi-Camera Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you move preprocessed INT8 tensors from host memory to the USB Coral Edge TPU with minimal latency and CPU overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1263", "title": "Architecting Sequence Models for Hailo-8 Dataflow", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", 
"phase": "both", "question": "Which sequence model architecture—encoder-only, decoder-only, or hybrid—best fits a host-streamed accelerator providing 26 TOPS of INT8 compute at 2.5W, and why?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1264", "title": "Architecting an Energy-Efficient Pipeline on Hailo-8", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the vision pipeline to minimize total energy per inference despite costly host DRAM streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1265", "title": "Architecting Extreme Quantization for Hailo-8 Streams", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you store sub-4-bit weights and execute on Hailo-8's INT8 datapath to relieve host-memory bandwidth?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1266", "title": "Architecting a Real-Time Vision Pipeline on Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you partition object detection and segmentation across Jetson Orin's Ampere GPU and DLA while managing 32 GB LPDDR5 bandwidth and the 60W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1267", "title": "Thermal and Bandwidth Graceful Degradation on Hailo-8", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What graceful degradation ladder would keep the Hailo-8 robot fail-operational when host thermal throttling cuts memory bandwidth preventing the execution of its full suite of models at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1268", "title": "Architecting Edge TPU INT8 Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill the FP32 teacher into an Edge TPU student that runs entirely in INT8 without CPU fallback?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1269", "title": "Architecting a Vision-Language Pipeline on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you decompose the VLM pipeline so a 4B model meets the 150 ms sensor-to-control latency budget, and how do you map it to hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1270", "title": "PCIe Coral Edge TPU Cluster Routing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 200 FPS from 10 cameras across 4 PCIe Coral Edge TPUs while minimizing copies and respecting INT8-only execution?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1271", "title": "Architecting Multi-Tenant Memory-Mapped Inference for Edge", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you memory-map and share a dozen model weights to allow processes to switch lines without cold starts within 32 GB?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1272", "title": "Zero-DRAM Accelerator Host Memory Scheduling", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage host memory for the Hailo-8 cascaded 4-stream pipeline to avoid fragmentation, memory pressure, and OOM kills?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1273", "title": "Edge Video Analytics Pipeline Architecture", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a pipeline to convert an FP32 PyTorch model to run on the USB Coral Edge TPU while preserving small-object accuracy?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1274", "title": "Architecting an INT8 Dataflow Pipeline for Hailo-8", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you convert and partition the vision model for Hailo-8 so unsupported ops do not create costly host round trips?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-11": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1275", "title": "Architecting Multi-Model Inference on Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantize and place the ASR, LLM, and TTS models on one Cloud AI 100 so they fit in 32 GB and stay within 75W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1276", "title": "Operator Scheduling Architecture on Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you statically schedule concurrent models on Cloud AI 100 to saturate 400 TOPS while reusing memory within 32 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1277", "title": "Coral Edge TPU Pipeline Bottleneck Analysis Design", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you profile the pipeline to locate whether latency comes from I/O, CPU pre/post-processing, or operator fallbacks?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1278", "title": "Architecting On-Premise Guardrails for PII Redaction", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect a fail-safe pipeline fitting PII, toxicity, and injection checks alongside the main generative model within the 32 GB memory constraint?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1279", "title": "Edge Video Storage Pipeline for NVIDIA Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you structure the on-device storage formats to ensure the 32 GB LPDDR5 memory is not bottlenecked while accommodating both the heavy video writes and metadata querying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1280", "title": "Zero-Copy Video Ingestion Pipeline for Hailo-8", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", 
"status": "published", "phase": "both", "question": "How would you design host-side ingestion and memory management for 4x1080p at 30 FPS on a DRAM-less Hailo-8 to avoid bandwidth bottlenecks?", "chain_ids": ["edge-chain-auto-secondary-014-12"], "chain_positions": {"edge-chain-auto-secondary-014-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1281", "title": "Systolic Array Tiling Strategy for Google Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose weight- versus output-stationary tiling to keep the INT8 array busy without CPU fallbacks?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 2}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1282", "title": "Architecting Thermal Management for Unconditioned Edge Deployments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage scheduling and thermal headroom to meet burst-latency SLAs without throttling in a hot telco edge enclosure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1283", "title": "Architecting a Multi-Model Pipeline on Jetson Orin under VRAM Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate memory and execution on Jetson Orin so a 7B LLM and three vision models avoid OOMs under the 32 GB unified memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1284", "title": "Multi-Model Data Streaming Bottleneck on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the Hailo-8 pipeline jumping from two 2 ms models to over 18 ms end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1285", "title": "Diagnosing Power Throttling in Low-Compute Edge Models", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Cloud AI 100 hit its 75W power limit with only 15% INT8 compute utilization but nearly saturated memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1286", "title": "Diagnosing Edge TPU Compiler Graph Partitioning", "topic": "graph-compilation", "competency_area": "optimization", 
"track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you identify the root cause of this compilation failure, and what constraints of the hardware architecture typically lead to this symptom?", "chain_ids": ["edge-chain-auto-secondary-006-28"], "chain_positions": {"edge-chain-auto-secondary-006-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1287", "title": "Hailo-8 Model Thrashing with mmap Weight Loading", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the mmap-based three-model pipeline stutter and drop below 1 TOPS even though each 15 MB model runs alone?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1288", "title": "Hailo-8 Host Bandwidth Exceeded Error Diagnosis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is host memory bandwidth exhausted and causing Hailo-8 timeouts even though the INT8 weights are only 15 MB?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1289", "title": "Diagnosing Latency Spikes on Edge TPUs", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of this degradation on the Google Coral Edge TPU, and how would you diagnose it?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1290", "title": "Diagnosing Guardrail Context-Switching", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the hardware-level root cause of this system degradation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1291", "title": "Diagnosing Low Utilization on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the architecture of a systolic array accelerator like the DLA, what is the root cause of this poor 
utilization?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 2}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1292", "title": "Diagnosing Sudden Inference Latency Spikes on Hailo-8 Under Load", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of the Hailo-8 throughput collapse in the 45°C enclosure?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1293", "title": "Canary Rollout Latency Spike on Edge TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this deployment failure?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1294", "title": "Adversarial Power Spikes on Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 70W power spikes and thermal throttling on identical inputs, and what are the security implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1295", "title": "Diagnosing Latency Spikes in Dynamic Batching on Edge Accelerators", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic batching with max batch 16 and 10ms timeout cause 30ms latency and only 300 req/sec at 600 req/sec traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1296", "title": "Diagnosing MobileNetV3 CPU Fallback", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural features of this specific CNN design are causing this severe performance bottleneck on the Coral accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1297", "title": "Diagnosing Cost Overruns in Edge LLM Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why 
did the compute-only estimate fail, and what is the actual hardware bottleneck limiting performance to 13.6 tokens/second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1298", "title": "Coreset Memory Bottleneck on Dataflow Edge", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this underutilization?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1299", "title": "Host Data Validation Starving Hailo-8 Dataflow", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did adding host-side data validation drop throughput from 300 FPS to 45 FPS and Hailo-8 utilization to 15%?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1300", "title": "INT8 Calibration Dataset Bias at the Edge", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the INT8 Jetson Orin deployment miss pedestrians specifically at dusk and night despite strong FP32 performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1301", "title": "Latency Spikes During Edge Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP32 KL/PSI drift monitor cause a 40% latency spike in the primary INT8 model on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1302", "title": "Diagnosing High Latency in Jetson Orin Unified Memory", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck on this specific SoC architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1303", "title": "Diagnosing Power Throttling in Memory-Bound Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the Cloud AI 100 hitting its 75W TDP and throttling while INT8 compute utilization stays below 
15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1304", "title": "INT8 Quantization Impact on Equalized Odds", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware-mandated transformation caused this localized degradation, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1305", "title": "Diagnosing CPU Fallback on Edge TPU", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural issue causes 120ms latency and 100% host CPU utilization on the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1306", "title": "Debugging Fallback OOM on Cloud AI 100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure during the fallback transition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1307", "title": "Distilled INT8 Quantization Collapse", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes this severe accuracy degradation specific to the distilled model after INT8 quantization?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1308", "title": "Diagnosing OOM from Memory Fragmentation on AI 100", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this failure?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1309", "title": "Diagnosing FP16 Overflow on Jetson Orin", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the FP16 TensorRT deployment drop from 85% to 55% mAP with collapsing bounding boxes?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": {"edge-chain-auto-secondary-011-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1310", "title": "Diagnosing Operator Fallback on Qualcomm Cloud AI 100", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the operator fallback on the Cloud AI 100, and how do you diagnose the specific coverage gaps?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1311", "title": "Diagnosing Intermittent Latency Spikes on Google Coral Edge TPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of these performance anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1312", "title": "Edge TPU Multi-Camera Batching Evaluation", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use static batching of 4 frames or sequential batch-1 processing to minimize latency while meeting 120 FPS?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1313", "title": "Evaluating MobileNet vs ResNet on Hailo-8 Dataflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture do you choose for this specific hardware, and how do the platform's memory characteristics drive your decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1314", "title": "Heterogeneous Model Routing on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Between running both models sequentially on the Ampere GPU (Architecture A) or pinning the router to the DLA and the LLM to the GPU (Architecture B), which architecture is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1315", "title": "Coral Edge TPU Model Selection and Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model, Alpha or Beta, is the most cost-effective choice for 4 cameras at 15 FPS on a Coral Edge TPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1316", "title": "Evaluating Active Learning for Cloud AI 100", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy do you choose, and how do you justify the tradeoff between selection quality and operational feasibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1317", "title": "Evaluating Drift Detection Architectures for High-Throughput Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1318", "title": "Optimizing PCIe DMA for 4K Video on Cloud AI 100", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the Cloud AI 100 pipeline use pinned-memory async DMA or zero-copy PCIe access for 4K60 frames, and why?", "chain_ids": ["edge-chain-auto-secondary-008-18"], "chain_positions": {"edge-chain-auto-secondary-008-18": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1319", "title": "Encoder vs Decoder on Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the offline smart home hub use a MobileBERT-style encoder or a small quantized GPT-style decoder on the Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1320", "title": "Evaluating Operator Energy Tradeoffs on Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model is more energy-efficient on the 2W Coral Edge TPU: fewer ops with 50M DRAM accesses or twice the ops fitting in SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1321", "title": "Evaluating INT8 Quantization Bias on Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you choose FP32 fairness plus overall INT8 accuracy (Pipeline A), or evaluate equalized odds on the Hailo-8 hardware (Pipeline B)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1322", "title": "Evaluating Edge Accelerators vs Traditional GPUs", "topic": "gpu-compute-architecture", "competency_area": "compute", 
"track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture minimizes system-level latency and power for this pipeline, and how does the Hailo-8's lack of local DRAM alter your model optimization strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1323", "title": "Evaluating Degradation Strategies on Coral Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach (Strategy A vs B) provides better fail-operational reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1324", "title": "Knowledge Distillation vs Pruning on Qualcomm Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you compress the 7B router with 80% structured pruning or distill it into a dense 1.4B student for Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1325", "title": "Evaluating Request Routing on Qualcomm Cloud AI 100", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the 8-card Cloud AI 100 cluster use model-ID consistent hashing or queue-depth weighted round-robin for request routing?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1326", "title": "Evaluating Memory-Mapped Weight Loading on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which approach optimizes cold start time and memory footprint on an NVIDIA Jetson Orin (32 GB LPDDR5), and what are the system tradeoffs?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1327", "title": "Evaluating Memory Management Strategies on Google Coral Edge TPU", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach provides better system stability and performance tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1328", "title": "Mixed-Precision Strategy for 
On-Premise LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 20B LLM use W8A8 or W8A16, and how do memory bandwidth and accuracy drive the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1329", "title": "Operator Scheduling Tradeoffs on Jetson Orin GPU vs DLA", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which strategy do you choose to maximize throughput while staying within the 60W power budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1330", "title": "Cloud AI 100 I/O Bottleneck Evaluation", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design better addresses the latency bottleneck and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1331", "title": "Evaluating Guardrail Deployment on NVIDIA Jetson Orin Edge Devices", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which guardrail architecture is better suited for this deployment, considering strict latency constraints and the Orin's 60W TDP limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1332", "title": "Format Selection for Host-Streamed Hailo-8 Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the offline inference logs be stored as Zstd Parquet or uncompressed flat binary records to maximize accelerator utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1333", "title": "Audio Stream Pipeline for Coral Edge TPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which pipeline is better for latency and system stability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1334", "title": "Dataflow Tradeoffs on Qualcomm Cloud AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the ViT-L MLP layers, would you use a weight-stationary or output-stationary dataflow, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1335", "title": "Evaluating Thermal Strategies for Jetson Orin Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a continuous 150 TOPS workload at 45°C ambient, would you use active cooling or passive burst-and-idle scheduling, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1336", "title": "Shadow Deployment Memory Constraints on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given that V1 uses 6 GB of memory and V2 uses 7 GB, how much shared memory remains during the shadow rollout?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1337", "title": "Adversarial Defense Overhead on Hailo-8", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total latency and host memory read bandwidth for one frame with 3 perturbed 1080p inputs on the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1338", "title": "Hailo-8 Dataflow Batching Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If 4 synchronized cameras are statically batched on the Hailo-8, what is the theoretical minimum compute latency for the batch assuming 100% MAC utilization?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1339", "title": "Compound Pipeline Throughput on Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the maximum theoretical throughput and energy per request for the sequential router-plus-expert pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1340", "title": "Hailo-8 Inference Throughput and Power Efficiency Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and frames per Joule for the 65 GOPS/frame Hailo-8 detector?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 1}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1341", "title": "Coreset Selection for INT8 Calibration", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How large is the 0.5% uncompressed calibration coreset, and does it fit in 32 GB of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1342", "title": "Validation Compute Budget on Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much Cloud AI 100 compute remains for a pre-validation anomaly detection model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1343", "title": "Active Learning Budget on Coral Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the daily labeling cost per device and Coral TPU compute fraction for the 50 MOPS uncertainty estimator at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1344", "title": "Drift Detection Latency Budget on Google Coral", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 15 ms inference in a 20 ms frame budget, is 10-bin PSI on the host CPU feasible and how much latency budget remains?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": {"edge-chain-auto-secondary-009-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1345", "title": "PCIe DMA Overhead on Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the PCIe DMA transfer time for one uncompressed 1080p RGB INT8 frame, and is the system compute-bound or data-movement-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1346", "title": "Hailo-8 INT8 Energy Per Operation Math", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average energy per INT8 operation at 26 TOPS and 2.5W, and how does it compare with an INT8 MAC baseline?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1347", "title": "On-Device Demographic Parity Energy Budget", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the Demographic Parity difference and total energy consumed (in Watt-hours) if evaluating 1,000 records/sec at 60W TDP.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1348", "title": "Jetson Orin INT8 Compute Utilization for Vision Transformers", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What INT8 compute throughput is required for 30 FPS, and what is the theoretical minimum Jetson Orin utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1349", "title": "Distilling to INT8 for Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the Coral Edge TPU student distillation pipeline, and what theoretical max FPS does a 50 GOPs/frame student achieve?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1350", "title": "Load Balancing Requests Across Google Coral TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 200 requests/sec with 50 GOPS/request, how much compute is required, can one Coral TPU handle it, and how should traffic be routed across 4 TPUs?", "chain_ids": ["edge-chain-auto-secondary-006-07"], "chain_positions": {"edge-chain-auto-secondary-006-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1351", "title": "Multi-Process mmap on Qualcomm Cloud AI 100", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For three INT8 8B LLM processes on 32 GB Cloud AI 100, what are the weight footprints with mmap versus independent loading and remaining memory?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1352", "title": "Host Buffer Sizing for Dataflow Edge Streaming", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "How much pinned host memory is needed to buffer enough 6 MB frames for a 50 ms stall with 4 MB contiguous pages?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1353", "title": "Estimating ResNet-50 Inference Speed on Coral Edge TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Coral Edge TPU's maximum theoretical FPS for an 8 GMAC/image model, and what precision constraint must the team understand?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1354", "title": "Dual-Branch Network Scheduling on Qualcomm AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the peak SRAM footprints for the A-then-B and B-then-A branch schedules, and which order minimizes spilling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1355", "title": "USB I/O Bottleneck on Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum per-frame latency including TPU compute and USB transfer, and what is the primary bottleneck?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1356", "title": "Guardrail Memory Footprint on Cloud AI 100", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total INT8 weight memory do the guardrails and 24B model require, and how much of the 32 GB remains for KV cache and activations?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1357", "title": "Edge Telemetry Storage Sizing on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 3 GB uncompressed log buffer last, and what is the 
hourly Parquet disk footprint with 4x compression if you switch to a columnar format?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1358", "title": "Hailo-8 Multi-Camera Ingestion Bandwidth", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What continuous host-to-Hailo-8 bandwidth is required to stream six uncompressed 1080p RGB cameras at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-014-12"], "chain_positions": {"edge-chain-auto-secondary-014-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1359", "title": "Google Coral INT8 Throughput and Energy Calculation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum IPS and energy per inference for the 200M-MAC INT8 model on the 4 TOPS Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1360", "title": "Calculate Sustained TOPS Under Edge Thermal Throttling", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming a linear relationship between power draw and compute throughput when throttled, with passive cooling limited to 60W, what maximum sustained INT8 TOPS can the Cloud AI 100 deliver?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1361", "title": "Active Learning Storage on Qualcomm Cloud AI 100", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If 0.05% of processed frames trigger the filter and each requires 1.5 MB, how much daily storage is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1362", "title": "Edge Guardrail Memory and Latency Sizing", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum INT8 safety model parameter count in the remaining memory, and its theoretical latency for a 100-token sequence at 20% TOPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1363", "title": "Hailo-8 Host Memory Bandwidth for Data Logging", "topic": "storage-format-selection", 
"competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum logging FPS can the host sustain for uncompressed row frames versus 4x-compressed TFRecord with 200 MB/s overhead?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1364", "title": "Shadow Deployment Sizing on Hailo-8", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 14 TOPS reserved for Model A, what maximum shadow FPS can Model B run at without exceeding 26 TOPS, and what is the estimated total power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1365", "title": "Jetson Orin GPU vs DLA Efficiency", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and Joules per frame for DLA-only versus GPU-only deployment of the 250 GOPS/frame model?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1366", "title": "Activation Spilling Latency on Dataflow Accelerators", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What latency penalty does spilling and reloading the 256x256x64 INT8 activation add over a 4 GB/s host interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1367", "title": "Randomized Smoothing on Edge TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At exactly 20 FPS, what maximum randomized smoothing pass count N can the Coral run, and what energy is consumed per smoothed frame?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1368", "title": "Stream Bandwidth for Depthwise Convs on Hailo-8", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory streaming bandwidth is required to feed the 100x100x64 depthwise-plus-pointwise layer at 10 TOPS utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1369", "title": "Throughput and Efficiency on Google Coral Edge TPU", "topic": 
"compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and inferences per Joule for a 2 GOPs/forward-pass INT8 model on a 4 TOPS, 2W accelerator?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 0}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1370", "title": "Compute-Bound Data Pipeline Throughput", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What data pipeline throughput in GB/s is needed to keep the 400 TOPS Cloud AI 100 fully utilized on 0.5 TOPS/frame 1080p inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1371", "title": "Image Validation Gate Throughput Calculation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 0.2% of Jetson Orin's 275 TOPS allocated, what maximum FPS can validate 10-megapixel frames at 200 INT8 ops per pixel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1372", "title": "Edge Drift Detection Overhead", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory footprint in bytes and operation count required per check to calculate the KL divergence of two 256-bin FP32 histograms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1373", "title": "DMA Transfer Latency Calculation for 4K Video Batches", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact PCIe data movement latency for one batch of 8 uncompressed 4K RGB frames over the 12 GB/s link?", "chain_ids": ["edge-chain-auto-secondary-008-18"], "chain_positions": {"edge-chain-auto-secondary-008-18": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1374", "title": "Encoder-Decoder Latency on Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum latency to generate 10 tokens with a 10 GOPS encoder and 2 GOPS per decoder token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1375", "title": "Energy Cost of INT8 MACs on Google Coral", "topic": "energy-per-operation", "competency_area": "power", 
"track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the energy per INT8 MAC in pJ and total compute energy per inference for 10 billion MACs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1376", "title": "4-Bit Weight Packing for Coral Edge TPU Bandwidth", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If your device interface has an effective memory bandwidth of 8 GB/s and the model contains 32 million parameters, what is the maximum memory-bound inference rate after packing 32M 4-bit weights over an 8 GB/s interface?", "chain_ids": ["edge-chain-auto-secondary-006-04"], "chain_positions": {"edge-chain-auto-secondary-006-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1377", "title": "Hailo-8 Fairness Evaluation Energy & Disparity", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the Equal Opportunity TPR disparity, theoretical FPS, and total accelerator energy for the 13,000-frame validation run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1378", "title": "Hailo-8 Compute Bound FPS Calculation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum compute-bound FPS for a 52 GOPS model on a 26 TOPS Hailo-8 at 80% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1379", "title": "Calculate Fallback Model Size for Coral TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 50% hardware utilization of peak compute, what is the maximum GOps per frame a fallback INT8 model can use to maintain 20 FPS under the 1W, 2 TOPS thermal cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1380", "title": "Compiler Graph Break Latency Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the effective host bandwidth for streaming is 2.0 GB/s in each direction, what is the total latency overhead added purely by this failure to fuse operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1381", "title": "Sizing the Distilled INT8 Student Model", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", 
"bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the largest INT8 student model that can sustain 10,000 tokens/s at 50% of 400 TOPS, and how much memory remains for KV cache?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1382", "title": "Weighted Round-Robin for Asymmetric Edge Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What weighted round-robin integer weights should the four AI 100 cards use, and what is the maximum theoretical throughput?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1383", "title": "Calculate Maximum Batch Size for LLM on Qualcomm Cloud AI 100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum batch size fits in 32 GB when weights use 20 GB, runtime uses 2 GB, and KV cache uses 1.5 GB per sequence?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 0}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1384", "title": "Multi-Process Memory-Mapped Inference Footprint", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the total memory footprints for four private 7B INT8 model loads versus shared read-only mmap weights, and will either OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1385", "title": "INT8 Compute Throughput on Cloud AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum theoretical inference throughput for the 800 GOPS model on the 400 TOPS INT8 hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1386", "title": "CI/CD Validation Latency and Energy on Cloud AI 100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical minimum validation time and accelerator energy for 1,000,000 samples at 500 GOPS each on a 400 TOPS, 75W accelerator?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1387", "title": "Edge TPU Operator 
Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum latency when 90% of a 2B-MAC model runs on the 4 TOPS Edge TPU and 10% on a 0.1 TOPS CPU sequentially?", "chain_ids": ["edge-chain-auto-secondary-006-12"], "chain_positions": {"edge-chain-auto-secondary-006-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1388", "title": "LLaMA 7B INT8 Memory Footprint on Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What memory footprint do the 7B INT8 weights require, and how much of the Jetson Orin's 32 GB unified memory remains?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1389", "title": "Alerting on Host-to-Device Streaming Starvation", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At only 300 MB/s PCIe bandwidth, what frame drop rate should monitoring expect for uncompressed 1080p RGB input at a 60 FPS target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1390", "title": "Hardware-Aware NAS Latency Bound on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum compute-bound latency should the NAS controller estimate for an 800 GOPs INT8 model on a 400 TOPS accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1391", "title": "Compute Peak Memory for Sequential Operators", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute peak activation memory footprint for the 3GB, 5GB, and 2GB sequential tensors, and what formula gives it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1392", "title": "Calculate Theoretical Minimum Compute Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 100% compute utilization, what is the theoretical minimum compute latency per frame for an 800 GOPS INT8 inference on a 400 TOPS Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1393", "title": "Unstructured Sparsity Bandwidth on Hailo-8", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What weight bandwidth at 60 FPS is required for the dense 4M INT8 layer versus 50% CSR sparsity with 16-bit indices, and does CSR help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1394", "title": "Calculate thermal throttling impact on Orin throughput", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum FPS should you expect when the Jetson Orin is throttled from 60W to 25W assuming linear scaling?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1395", "title": "Zero-Copy Micro-Batching on Hailo-8 Architecture", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Because Hailo-8 is a dataflow accelerator with no local DRAM, how would you batch and schedule the detector and license plate reader to minimize PCIe contention and meet the 33ms detector SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1396", "title": "On-Device Coreset Selection for Edge Continual Learning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the AI 100 active data selection and coreset pipeline to maximize ICR while sending only a tiny fraction of 60 FPS video for labeling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1397", "title": "Zero-Copy Video Pipeline for Coral Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a zero-copy pinned-memory DMA pipeline from camera to Coral to meet the 33 ms 30 FPS deadline?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1398", "title": "Cross-Silo FL Model Aggregation on Qualcomm AI 100", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you size LoRA fine-tuning for the 3B model on a 32 GB AI 100 and 
minimize daily synchronization bandwidth across 50 hospitals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1399", "title": "Automotive ASIL-D Certification for Lidar Perception on Jetson Orin", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What Orin architecture and architectural choices would provide deterministic ASIL-D fault detection within 50 ms for transient hardware faults and software lockups?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1400", "title": "Thermal Sizing for Fanless Hailo-8 Edge Camera", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the 13 TOPS, 4 GB/s Hailo-8 vision pipeline thermally feasible in the 8W fanless enclosure, and what changes are needed?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1401", "title": "Edge TPU Precision Requirements for Efficient CNNs", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific numerical precision format is strictly required by the Google Coral Edge TPU for hardware acceleration?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1402", "title": "Hailo-8 Local Memory Specification", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When designing the memory buffer for these intermediate states, how much local DRAM does the accelerator provide to store these chained inference outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1403", "title": "Qualcomm Cloud AI 100 Specifications", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the baseline INT8 throughput, memory capacity, and power consumption of a single Qualcomm Cloud AI 100 card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1404", "title": "Hailo-8 Local 
DRAM Memory Architecture", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific capacity of local DRAM on the Hailo-8?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1405", "title": "Jetson Orin Zero-Copy Memory Architecture Identification", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What architectural feature makes zero-copy data movement possible, and what memory allocation technique is required to leverage it?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1406", "title": "Edge TPU Precision Requirements for Bias Evaluation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the mandatory numerical precision required for the Google Coral Edge TPU, and what is its peak performance per watt?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1407", "title": "Analyzing Edge TPU Throughput Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does a 20 GOPS Edge TPU model achieve only 40 FPS with high host CPU utilization despite a 200 FPS compute ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1408", "title": "KV Cache OOM Analysis on Cloud AI 100", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the 7B INT8 LLM OOM on the 32 GB Cloud AI 100 at batch 64 and sequence length 2048 despite weights using only 7 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1409", "title": "Thermal Throttling of Adversarial Defenses", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does adding 5 ms randomized input smoothing drop the Jetson Orin workload to 20 FPS instead of the expected ~28 FPS?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1410", "title": "Dynamic Batching Latency Spikes on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic batching to batch size 4 with a 30 ms timeout raise p99 latency from 15 ms to 50 ms on the 120 FPS pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1411", "title": "Explaining Latency Regressions in Depthwise Convolutions on Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can replacing dense convolutions with depthwise separable convolutions slow the Cloud AI 100 despite an 8x FLOP reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1412", "title": "Edge TPU Pipeline Throughput Degradation Analysis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the two-stage Edge TPU pipeline reach about 40 ms latency with 90% CPU utilization, and what is the true end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1413", "title": "Compute Efficiency and Power Limits on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Jetson Orin hit its 60W power limit before reaching its 275 TOPS compute limit, and what is the energy cost per inference?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 1}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1414", "title": "Information-Compute Ratio on Coral TPU", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the Coral TPU system's Information-Compute Ratio collapse at 120 FPS despite fitting within the 4 TOPS limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1415", "title": "Host Preprocessing Bottleneck on Hailo-8", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Hailo-8 dataflow accelerator starved, and what bottleneck latency does the pipeline equation imply at 12 FPS?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 1}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1416", "title": "INT8 Clipping in Edge Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the Coral INT8 anomaly gate flag 40% of valid bright frames as anomalous during peak sunlight?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1417", "title": "Data Selection Bias in High-Throughput Edge Inference", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did sampling only 2 FPS for curation cause the 60 FPS Hailo-8 deployment to fail despite strong validation metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1418", "title": "Drift Detection I/O Bottleneck on Dataflow Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does streaming a 3MB intermediate feature map to the host cap the Hailo-8 pipeline at 200 FPS despite low compute use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1419", "title": "Hailo-8 DMA PCIe Streaming Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this architecture experience this specific performance wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1420", "title": "Encoder-Decoder Latency Disparity on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the encoder run in 10ms while the autoregressive decoder takes 250ms on the same Jetson Orin hardware?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1421", "title": "Energy Cost Analysis of Memory vs Compute on Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the Horowitz energy table principles, why does memory-bound Layer B draw 45W at only 10% TOPS utilization while dense Layer A draws 20W at 80% utilization?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": 
{"edge-chain-auto-secondary-005-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1422", "title": "Analyzing 3-bit GPTQ Latency Degradation on Jetson Orin", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can 3-bit GPTQ make the 14B model slower and more power-hungry on Jetson Orin than standard INT8 despite fitting in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1423", "title": "Analyzing INT8 Quantization Disparities on Cloud AI 100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does pure INT8 raise one subgroup's false rejection rate, and what throughput cost comes from keeping the final 20% of compute in FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1424", "title": "Analyzing Federated Averaging Bottlenecks on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does synchronization, not local training, bottleneck the Jetson Orin federated system with 8GB updates, and how would you reduce it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1425", "title": "Batch Size Impact on Compute Utilization", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Cloud AI 100 utilization jump from 40 sustained TOPS at batch size 1 to 320 TOPS at batch size 32?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1426", "title": "Power-Constrained Degradation on Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the robot switch to a 40 TOPS DLA-only model at 15 FPS instead of running the 160 TOPS GPU model at a lower framerate under the 25W limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1427", "title": "SRAM Spilling in Fused Operators on Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does fusing the convolution and GeLU make latency worse, and what latency penalty does the 4MB SRAM spill add?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1428", "title": "Kernel Fusion on Heterogeneous SoCs", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does fusing the DLA convolution with the custom GPU activation degrade latency even though it removes 20MB of LPDDR5 traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1429", "title": "Distillation Arithmetic Intensity on Hailo-8", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the 1 GOP distilled student slower than the 5 GOP pruned teacher on a DRAM-less Hailo-8 with a 4 GB/s PCIe link?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1430", "title": "Edge LLM Latency Breakdown on AI 100", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the end-to-end latency for the 500-token prompt and 100-token response, and why does 400 TOPS mainly help prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1431", "title": "Hailo-8 Host Memory Bandwidth Imbalance", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does round-robin routing create latency spikes across two identical Hailo-8 chips for mixed-resolution video streams?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1432", "title": "Hailo-8 Host Memory Streaming Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 reach only about 15 FPS for the 50 GOP model despite needing only 3 TOPS at 60 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1433", "title": "Coral Edge TPU mmap Initialization Over USB", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does host mmap fail to give zero-copy Coral initialization, and why is each 24MB model cold start exactly 400ms?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1434", "title": "OOM During Gradient Accumulation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Jetson Orin fine-tuning job still hit OOM at the end of the final micro-batch despite 4-way gradient accumulation?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1435", "title": "Hailo-8 CI/CD Pipeline Bandwidth Bottleneck", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 stay under 2% compute utilization yet cap at 50 FPS on 4MB input frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1436", "title": "TensorRT Graph Fragmentation on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do three DLA-unsupported ONNX operators disproportionately hurt Jetson Orin latency, and what is the new effective throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1437", "title": "Thermal Throttling on Jetson Orin GPU vs DLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving some layers from the Ampere GPU to the DLA let the 200 TOPS vision pipeline run without thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1438", "title": "Edge TPU Quantization and Bottleneck Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral reject the 8M-parameter FP32 model, and what are the FP32 and INT8 memory footprints that make USB I/O dominate?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1439", "title": "Power Throttling Stragglers on Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do P99 latency stragglers appear when the Cloud AI 100 hits 75W, and what latency increase results from a 20% compute throttle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1440", "title": "NAS Memory-Bound Analysis on Hailo-8", "topic": 
"neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does hardware-aware NAS prefer a 4 GOPS, 4M-parameter model over a 2 GOPS, 20M-parameter model on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1441", "title": "Edge TPU Subgraph Partitioning Transfer Overhead", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does inserting an unsupported Swish between two TPU Conv2Ds cause high latency, and what USB transfer penalty does the 1MB feature map create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1442", "title": "A/B Partition OTA Power Throttling on AI 100", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the accelerator throttle INT8 inference during background OTA flash programming even though weights are already in memory?", "chain_ids": ["edge-chain-auto-027-18"], "chain_positions": {"edge-chain-auto-027-18": 0}, "chain_tiers": {"edge-chain-auto-027-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1443", "title": "Explain Thermal Throttling on Google Coral Edge TPU", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Coral Edge TPU inference latency double after 15 minutes of continuous 4 TOPS operation in a passively cooled enclosure?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1444", "title": "Analyzing I/O Latency Bottlenecks on Hailo-8", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the end-to-end latency about 7ms when the Hailo-8 compute time is only 1ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1445", "title": "Unstructured Sparsity Inefficiency on Dense Accelerators", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 80% unstructured pruning fail to improve latency on the dense Cloud AI 100 INT8 accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1446", "title": "Per-Channel INT8 Overhead on AI 100", "topic": "quantization-fundamentals", "competency_area": 
"precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does per-channel INT8 quantization slightly slow the Cloud AI 100, and how much extra metadata is needed for a linear layer with 4096 output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1447", "title": "Coral Edge TPU Frame Budget Analysis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the Coral Edge TPU pipeline miss the 16.6ms frame budget even though the TPU runs the model in 8ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1448", "title": "Quantization Bias in TPU Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why did deploying the toxicity guardrail on Coral cause a minority-dialect fairness failure, and what latency and energy costs come with CPU FP32?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1449", "title": "Jetson Orin Roofline Memory Bound Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, why does a 100 OP/byte INT8 model reach only about 7% compute utilization on Jetson Orin?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 2}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1450", "title": "Watchdog Resets Under Thermal Throttling", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the BIST watchdog reset occur under peak thermal load even though INT8 inference still meets its 20ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1451", "title": "Row vs Columnar Storage Latency and Throughput Tradeoffs", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does compressed Parquet hurt batch-1 latency but boost batch-4096 throughput, and what is the memory read reduction per batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1452", "title": "Unified Memory Contention in High-Frequency Edge Ingestion", "topic": 
"streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do latency spikes and frame drops appear at 60Hz LiDAR ingestion despite 30% accelerator utilization and 40W power draw?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1453", "title": "Streaming Dataflow Bottlenecks on Edge Accelerators", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the accelerator reach only 0.2 TOPS on this 100 MOPS, 2 MB layer despite a significantly higher peak capability?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1454", "title": "Analyzing Thermal Throttling on Google Coral Edge TPU", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU drop from 30 FPS to 15 FPS after 15 minutes at 40°C, and what thermal tradeoff is occurring?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1455", "title": "Edge TPU Attention Cost Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 256-token transformer's attention fall back to CPU on the Edge TPU, and how would you keep it on INT8 hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1456", "title": "KV Cache Scaling on Qualcomm Cloud AI 100", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 7B INT8 LLM OOM at batch size 14 with a 4096-token context on a 32 GB Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1457", "title": "Shadow Deployment Architecture on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you run a shadow vision model on Jetson Orin without adding production latency or exceeding the 32 GB, 60W limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1458", "title": "On-Premise Video Analytics Accelerator Selection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use multiple T4-class GPUs 
or one Cloud AI 100 for 50 1080p streams under a 150W budget, and why?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1459", "title": "On-Device Fine-Tuning Memory Strategy", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you use freezing and gradient checkpointing to fine-tune on Jetson Orin without OOMing 32 GB shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1460", "title": "Adversarial Defense Architecture for Streaming Edge Accelerators", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which adversarial-patch defense would you choose for Hailo-8, and how would it meet real-time bandwidth and compute limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1461", "title": "Hailo-8 Multi-Stream Dataflow Batching Architecture", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What scheduling and batching policy would you use on one Hailo-8 to keep four 30 FPS streams under a 15 ms latency budget while maintaining the required 120 FPS combined throughput?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1462", "title": "On-Premise RAG Pipeline Design for Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the Cloud AI 100 RAG pipeline, would you co-locate the embedding, ranker, and 8B LLM or swap them sequentially to meet 1s TTFT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1463", "title": "Multi-Model Drone Inspection Design on Hailo-8", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the true system-level cost of running four 15 GOPS/frame networks at 30 FPS on Hailo-8, and how would you avoid bandwidth stalls?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 3}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1464", "title": "Architecting Data Pruning for Edge Continual 
Learning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you build a streaming coreset pipeline to select from 10 million daily logs under the 75W limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1465", "title": "Active Learning Curation for Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you curate field data from Coral Edge TPU cameras to improve INT8 accuracy without exceeding edge bandwidth limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1466", "title": "Coral TPU Edge Drift Detection Architecture", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you detect lighting or sensor drift on the system without exceeding the 2W budget or relying on unsupported operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1467", "title": "Zero-Copy Video Pipeline Architecture for Google Coral Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the 1080p60 Coral USB data pipeline to fix 100% CPU use while the Edge TPU sits at 30% utilization?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1468", "title": "Architecting Real-time Translation on Hailo-8", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which speech-to-text architecture would you choose for this device, and how would you handle decoding under its streaming constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1469", "title": "Architecting Energy-Efficient Streams for Hailo-8", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Architect the data ingestion and model execution flow for four 1080p streams to minimize host memory access energy per operation.", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1470", "title": "Sub-4-bit Weight Streaming 
Architecture for Hailo-8", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you use sub-4-bit host-side weight storage while still feeding INT8 weights efficiently to the accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1471", "title": "Architecting On-Device Fairness Evaluation for Edge Vision", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you run continuous on-device fairness evaluation without sending video off-device or exceeding 32 GB and 60W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1472", "title": "Federated Learning Architecture on Hailo-8 Dataflow Accelerators", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design on-device federated learning with Hailo-8 nodes given host streaming bottlenecks and non-IID driver data?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 1}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1473", "title": "Optimizing Sensor Fusion Kernels on Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you structure CUDA thread blocks, warps, and shared memory tiling on Jetson Orin to avoid LPDDR5 bandwidth saturation?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1474", "title": "Hailo-8 Host Bandwidth Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Design a fail-operational graceful degradation ladder that maintains checkout accuracy without crashing the dataflow pipeline.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1475", "title": "Multi-Model Compilation Strategy for NVIDIA Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you map the detector, planner, and fusion graphs across Jetson Orin's GPU and DLAs within the 60W and 32 GB limits?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-26": 
"secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1476", "title": "Architecting Knowledge Distillation for Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill the float32 teacher into a Coral-compatible INT8 student while respecting Edge TPU operator limits?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1477", "title": "Architecting an Object Detection Pipeline on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate the 30 ms latency budget for 4K60 drone vision on Jetson Orin across preprocessing, inference, postprocessing, and control?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1478", "title": "Multi-TPU Routing for High-Frequency Industrial Inspection", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 8 concurrent 60 FPS streams across four Coral Edge TPUs while minimizing latency and PCIe contention?", "chain_ids": ["edge-chain-auto-secondary-006-07"], "chain_positions": {"edge-chain-auto-secondary-006-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1479", "title": "Edge TPU Multi-Model Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which models would you keep resident in Coral Edge TPU SRAM, and when would you swap over USB for the motion/person/face cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1480", "title": "Architecting Multi-Model Memory-Mapped Inference on Cloud AI 100", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design memory-mapped model sharing for 5 CV models across 20 streams on a 32 GB appliance?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1481", "title": "Handling Host Memory Pressure for Hailo-8 Streams", "topic": "memory-pressure-management", 
"competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage host memory for Hailo-8 DMA under fragmentation and daemon spikes in a 4 GB smart-city appliance?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1482", "title": "Hailo-8 Multi-Model Pipeline Design and Host Memory Offloading", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you fuse the detector, pose, and action models for Hailo-8 or deploy separate ONNX graphs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1483", "title": "Architecting a Multi-Model Pipeline on Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you lay out memory and schedule detector, tracker, and transformer Re-ID execution on Cloud AI 100 for batches up to 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1484", "title": "Multi-Tenant Operator Scheduling on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule multiple transformer models on Cloud AI 100 to maximize 400 INT8 TOPS without LPDDR4x memory thrashing?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1485", "title": "Architecting a Resilient OTA Update System for Jetson Orin", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design OTA updates for Jetson Orin robots to allow zero-downtime downloads and automatic rollback after boot failure?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 4}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1486", "title": "Architecting Low-Latency Profiling for Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you profile Coral Edge TPU latency spikes to separate USB bandwidth, CPU-transfer, and operator-fallback 
bottlenecks without significantly impacting the device's 2W power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1487", "title": "Architecting Structured Sparsity on Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Would you use 80% unstructured pruning for this edge model, and what hardware-aware sparsity strategy would you deploy?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 3}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1488", "title": "Designing a Hybrid Quantization Strategy for Jetson Orin", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose PTQ vs QAT, calibration granularity, and GPU/DLA mapping for a 7B multimodal pipeline on Jetson Orin?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 2}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1489", "title": "On-Premise Guardrail Architecture for Legal Summarization", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate hardware resources between the main summarization model and guardrails to meet a 2-second end-to-end latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1490", "title": "Edge Sensor Data Ingestion Pipeline Architecture on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What storage format and compression strategy would you use on Jetson Orin to handle 1.5 GB/s sensor ingress without starving inference?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1491", "title": "Real-time Multi-Camera Pipeline Architecture for Hailo-8 Dataflow Accelerator", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Architect the streaming ingestion pipeline to handle frame capture, preprocessing, and continuous tensor streaming to the Hailo-8 without starving it.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1492", "title": "Architecting Dataflow for 
Google Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What tiling and dataflow strategy would you use to optimize a high-resolution CNN for the Coral Edge TPU's INT8 systolic array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1493", "title": "Architecting Thermal Management for Qualcomm Cloud AI 100", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you balance active cooling, DVFS, and workload shedding to sustain throughput at 45°C on the 75W accelerator?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1494", "title": "Architecting Unified Memory for Concurrent Edge Models", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you budget the Jetson Orin's 32 GB shared memory so the 14B LLM and ViT never OOM during concurrent inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1495", "title": "Architecting a Multi-Camera Perception System for Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What CNN architecture and GPU/DLA mapping would you use for six 4K 30 FPS streams on Jetson Orin without becoming memory-bound?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1496", "title": "Hailo-8 Dataflow Optimization and Memory-Bound Operator Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the fusion strategy for these memory-bound operations to minimize latency and host memory access on the Hailo-8?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 2}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1497", "title": "Edge TPU Fleet CI/CD Architecture", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you build the CI/CD pipeline to quantize, compile, validate, and safely OTA-deploy Coral Edge TPU model 
updates?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 2}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1498", "title": "Hardware-Aware NAS for Edge TPU Real-Time Inference", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS for Coral Edge TPU that meets 30 FPS, 2W, INT8, and operator-compatibility constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1499", "title": "Architecting a Streaming Pipeline for Hailo-8", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you use the roofline model to evaluate the tradeoffs of which operations execute on the Hailo-8 versus the host, and how does the lack of local DRAM constrain your design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1500", "title": "Jetson Orin Unified Memory Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Given the Orin's architecture and its 60W TDP budget, what is the most likely root cause of this latency bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1501", "title": "Latency Spikes in Conditional Privacy Guardrails on Hailo-8", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware architecture constraint causes these extreme latency spikes despite low compute utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1502", "title": "Real-Time Video Ingestion Bottleneck on Cloud AI 100", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this poor accelerator utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1503", "title": "Diagnosing DLA Memory Bottlenecks on Jetson Orin", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely architectural root cause of this severe memory bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1504", "title": "Diagnosing Sustained Throughput Drop on Hailo-8", "topic": "thermal-management", 
"competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the hardware architecture, what is the root cause of this performance collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1505", "title": "Canary Rollout CPU Fallback Degradation", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this deployment failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1506", "title": "Diagnosing Power Anomalies from Adversarial Energy Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the Cloud AI 100 to hit its 75W TDP and throttle despite unchanged input resolution and frame rate?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1507", "title": "Diagnosing Throughput Collapse with Dynamic Batching on Cloud AI 100", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing this throughput collapse despite the high incoming request rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1508", "title": "Diagnosing CPU Fallback in EfficientNet on Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural feature of standard EfficientNet causes this fallback, and how is it resolved?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1509", "title": "Diagnosing Host Bottlenecks with Synthetic Data on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What system-level bottleneck is leaving the accelerator idle while the host CPU is at 100% after synthetic-data-style preprocessing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1510", "title": "Hailo-8 Host Memory Streaming Validation Contention", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause 
of these anomalies in the data pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1511", "title": "Diagnosing INT8 Quantization Failures from Biased Calibration", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What calibration dataset curation failure caused the Jetson Orin INT8 model to fail at night while FP32 still works?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1512", "title": "Feature Map Drift Detection Power Throttling", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the power spike and thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1513", "title": "Diagnosing OOM Errors in Edge Encoder-Decoder", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1514", "title": "INT8 Quantization Bias on Coral Edge TPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware-specific compilation requirement of this platform is the most likely root cause of this newly introduced disparity, and how do you verify it?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1515", "title": "Edge TPU Operator Fallback Diagnosis", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the hardware-level root cause of this symptom, and how do you confirm it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1516", "title": "Diagnosing Distillation Latency Spikes on Edge Accelerators", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing these latency spikes and how would you diagnose the bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1517", "title": "Dataflow Starvation and Watchdog Triggers", "topic": "safety-certification", "competency_area": 
"reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What architectural issue makes inference miss deadlines when the host writes diagnostic logs to NVMe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1518", "title": "Diagnosing Host Bottlenecks with Dataflow Accelerators", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What system-level bottleneck causes 30 FPS throughput and 100% host CPU despite the Hailo-8's 500 FPS standalone benchmark?", "chain_ids": ["edge-chain-auto-secondary-008-13"], "chain_positions": {"edge-chain-auto-secondary-008-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1519", "title": "Edge TPU Activation Memory Spilling and Partitioning", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the INT8 Coral Edge TPU model get partitioned into subgraphs with hundreds of milliseconds of latency?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1520", "title": "Diagnosing Data Starvation on Jetson Orin", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing this throughput bottleneck, and how would you diagnose the root cause within the data ingestion pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1521", "title": "Diagnosing Hotspots in Edge Consistent Hashing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does one node throttle and OOM while all three nodes receive the same number of camera streams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1522", "title": "Diagnosing Thermal Throttling on NVIDIA Jetson Orin", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this sudden performance drop, and how do you diagnose it?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1523", "title": "Shadow 
vs Canary Deployment on Host-Dependent Accelerators", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which rollout strategy (timesliced shadow or fleet-subset canary) do you recommend for this specific hardware, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1524", "title": "Batching on Google Coral Edge TPU", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use static batching or dynamic batching for 4 Coral TPU cameras at 30 FPS to minimize p99 latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1525", "title": "Dataflow Accelerator CNN Architecture Evaluation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Since the Hailo-8 has no local DRAM and relies on streaming from host memory, which architecture do you choose and how do you evaluate their actual on-device performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1526", "title": "Optimizing RAG Pipeline Latency on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture do you choose, and how do you allocate resources to ensure predictable latency?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1527", "title": "Evaluating Cost and Latency Trade-offs on Google Coral Edge TPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the robots use cloud GPU inference over 5G or local Edge TPU inference for 10 FPS 24/7 object detection, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1528", "title": "Edge Data Quality Gating on Jetson Orin", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data quality gate is better on Jetson Orin: a GPU autoencoder or ISP heuristics plus a tiny DLA INT8 classifier?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "edge-1529", "title": "Active Learning Strategies for On-Premise Defect Detection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which active learning strategy do you select, and how do you justify the hardware resource tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1530", "title": "Evaluating Drift Detection on On-Premise Accelerators", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which alternative is better and why?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1531", "title": "Evaluating Host-Device DMA Strategies for High-Resolution Video Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate these alternatives and determine which architecture point maximizes overall system throughput.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1532", "title": "Architecture Selection for Coral Edge TPU Translation", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which translation architecture should you deploy on the Coral Edge TPU given full INT8 quantization and limited operator support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1533", "title": "Evaluating Energy Efficiency of Architectures on Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 model will consume less energy per inference on the 2W Coral Edge TPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1534", "title": "INT8 Calibration Bias on Dataflow Edge Accelerators", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design is better for evaluating and maintaining fairness, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1535", "title": "Evaluating Compute Migration to Dataflow Accelerator", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you partition intermediate feature maps to the CPU or compile the entire model onto the accelerator under the 3W budget, and why?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1536", "title": "Evaluating Fail-Operational Fallbacks on Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy provides better fail-operational reliability during thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-011-05"], "chain_positions": {"edge-chain-auto-secondary-011-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1537", "title": "Distillation vs Pruning on Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which compression strategy is better: 80% unstructured pruning + INT8, or Knowledge distillation to a dense 15B parameter student + INT8, and why?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1538", "title": "Load Balancing Strategies for Stateful Edge Inference", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Given the hardware constraints of the Cloud AI 100, which approach is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1539", "title": "Zero-Copy Memory Mapping for Multi-Stream Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory design should you use for four processes sharing the same 8 GB model on the 32 GB embedded device, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1540", "title": "Edge TPU Model Swapping and Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which memory management strategy better prevents OOM crashes and memory fragmentation for the accelerator tensor I/O?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1541", "title": "Evaluating Mixed-Precision Inference on Qualcomm AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach is better suited for this specific hardware, and what bottlenecks dictate this decision?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1542", "title": "Edge TPU Operator Delegation Tradeoffs", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which alternative do you choose to stay within your power and latency budgets?", "chain_ids": ["edge-chain-auto-secondary-006-12"], "chain_positions": {"edge-chain-auto-secondary-006-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1543", "title": "GPU vs DLA Parallel Scheduling Tradeoffs", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which schedule is better for minimizing end-to-end latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1544", "title": "Bottleneck Analysis on Qualcomm Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which proposal is better for reducing the 45 ms frame latency to the 30 ms SLA on the Cloud AI 100, and why?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 3}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1545", "title": "Evaluating Real-Time Safety Guardrails on Jetson Orin", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach provides better organizational accountability and system performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1546", "title": "Edge Storage Format for Hailo-8", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which storage format should you use to keep the Hailo-8 fed from 100 MB/s eMMC, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1547", "title": "Edge TPU Audio Event Ingestion Architecture", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which streaming ingestion architecture is better for the edge anomaly detector, and why?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1548", "title": "Dataflow Tradeoffs on Qualcomm AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the LLM projection matrices use weight-stationary or output-stationary dataflow, and how should you tile them?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 2}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1549", "title": "Sustained Throughput vs Thermal Throttling on Jetson Orin", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy yields higher sustained 24-hour throughput in the 40\\u00b0C sealed enclosure, and why?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1550", "title": "Evaluating Edge Coreset Selection for On-Device Adaptation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is better suited for this edge environment and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1551", "title": "Evaluating 4-bit AWQ versus INT8 on Coral Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt 4-bit AWQ or standard INT8 QAT for the Coral Edge TPU drone model, and why?", "chain_ids": ["edge-chain-auto-secondary-006-04"], "chain_positions": {"edge-chain-auto-secondary-006-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1552", "title": "Edge TPU Operator Fusion Tradeoff", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you keep Swish on the host CPU or replace it with ReLU6 for full INT8 TPU execution, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1553", "title": "ViT vs CNN Ensemble Memory Tradeoffs on Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is better suited for the Jetson Orin: the FP16 ViT-L or the INT8 EfficientNet-B7 ensemble, and why?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1554", "title": "Optimizing Orin Pipeline Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is more likely to meet the 33 ms latency target, and what is the critical bottleneck in each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1555", "title": "Dual-Model Memory Management on Edge TPU", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "With 8 MB Coral SRAM, should you swap the 4 MB model over USB or spill 2 MB activations, and which has lower latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1556", "title": "Evaluating OTA Strategies for Qualcomm Cloud AI 100", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which OTA update strategy is better suited for this 50 Mbps constrained environment, and how is rollback handled?", "chain_ids": ["edge-chain-auto-027-18"], "chain_positions": {"edge-chain-auto-027-18": 1}, "chain_tiers": {"edge-chain-auto-027-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1557", "title": "Shadow Deployment Resource Budgeting on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much RAM and power remain for the OS and robotic control stack during the Jetson Orin shadow deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1558", "title": "Adversarial Purification on Hailo-8", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum FPS and energy per frame result from running the 40 GOPS detector plus 12 GOPS purifier on the 2.5W Hailo-8?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1559", "title": "Hailo-8 Static Batch Latency Calculation", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum batch size under the 10 ms latency budget at 50% Hailo-8 utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1560", "title": "Depthwise Separable Convolution on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many MACs does the standard 3x3 convolution use versus depthwise separable convolution, and what is the reduction factor?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1561", "title": "RAG Concurrency and Power on Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many concurrent RAG sessions can the 32 GB Cloud AI 100 support, and what is the power budget per session at 75W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1562", "title": "Hailo-8 INT8 FPS and Energy Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and energy per frame for the 52 GOPs detector on the 2.5W Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1563", "title": "On-Device Coreset Capacity Calculation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If each embedding is a 4096-dimensional FP16 vector, how many coreset embeddings fit in the remaining Cloud AI 100 memory?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1564", "title": "Data Anomaly Detection Throughput on Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum theoretical validation throughput for a 2-million-operation INT8 model on the 400 TOPS Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1565", "title": "Active Learning Inference Time on Coral TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long will one Edge TPU take to process 2,000,000 images at 50% of its 4 TOPS peak if each image costs 8 GOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1566", 
"title": "Edge TPU Compute Capacity and PSI Drift Calculation", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What compute capacity remains after adding the 8 GOPS/frame autoencoder at 50 FPS, and what is the defect-bin PSI contribution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1567", "title": "Edge TPU USB Bandwidth Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the host-to-device transfer time for one 224x224x3 INT8 image over USB 2.0, and is the system compute-bound or I/O-bound?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1568", "title": "Encoder vs Decoder on DRAM-less Hailo-8", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical minimum latencies for encoder-only versus decoder-only processing of 128 tokens on the DRAM-less Hailo-8?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1569", "title": "Hailo-8 INT8 Energy Per Operation", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical energy per INT8 operation for the 26 TOPS, 2.5W Hailo-8 before host memory costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1570", "title": "On-Device Demographic Parity for Pedestrian Intention", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Demographic Parity Difference between adults and children, and what is the TOPS overhead of a 0.5% fairness classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1571", "title": "Jetson Orin INT8 Compute and Energy Budgeting", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 50 FPS, what is the per-frame INT8 operation budget and energy per frame for a 275 TOPS, 60W Jetson Orin?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1572", "title": 
"Power-Constrained QoS Shedding on Hailo-8", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At a 1.0W cap, what degraded compute capacity and primary-detector FPS can the Hailo-8 sustain after shedding the secondary model?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1573", "title": "Knowledge Distillation to INT8 for Google Coral", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What max FPS and energy per inference do you get, and what INT8-aware distillation step is needed for Coral?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1574", "title": "Load Balancing Inference Across Multiple Coral TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With 12,000 requests/s across 4 Coral TPUs, what compute load per TPU is required and can the cluster sustain it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1575", "title": "Memory-Mapped Concurrent Edge Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 4 processes on 32 GB Cloud AI 100, how much memory per process remains for KV cache and activations with independent vs mmap-shared INT8 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1576", "title": "Host Memory Allocation and DMA Overhead for Hailo-8", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total host memory footprint is used by the 5-frame Hailo-8 ring buffer plus 4KB scatter-gather DMA metadata if the OS requires a 16-byte tracking descriptor for every page?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1577", "title": "Google Coral Edge TPU INT8 Throughput", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What theoretical max FPS and FPS/W do you get on the 4 TOPS, 2W Coral Edge TPU for a 300M-MAC INT8 frame?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1578", "title": "Parallel Operator Scheduling and Energy on AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": 
"edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 80% utilization on a 400 TOPS, 75W accelerator, what are the execution time and energy for an 80 TOP INT8 block?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1579", "title": "USB I/O Bottleneck Analysis on Google Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using 400 MB/s effective USB 3.0 bandwidth, how do compute latency and 3MB input transfer time compare for the 2 GOPS model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1580", "title": "Ampere 2:4 Structured Sparsity on Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Will applying Ampere 2:4 structured sparsity allow the 180 TOPS, 20 GB INT8 model to meet the Orin's limits, and what is the sparse weight size?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 1}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1581", "title": "Edge ESG Metrics and Energy Calculation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 50% utilization on a 400 TOPS, 75W accelerator, what latency and energy per 20 TOP moderation request should you report?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1582", "title": "Edge Data Buffering and Parquet Compression", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 150 MB/s telemetry buffered for 1 minute, what are the uncompressed buffer size, compressed Parquet row group size, and 2-hour storage need?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1583", "title": "Hailo-8 Host Streaming Bandwidth and Throughput", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 65 GOPS/frame model with 5 MB inputs, what is the theoretical maximum frame rate and minimum sustained host memory bandwidth required to avoid stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1584", "title": "Edge TPU Systolic Array Throughput Calculation", "topic": 
"systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "On a 4 TOPS, 2W Coral Edge TPU, what max FPS and energy per inference result from a 10 GOPS convolution layer?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 0}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1585", "title": "Calculate Cooling Requirements for Cloud AI 100 Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total thermal dissipation requirement in British Thermal Units per hour (BTU/hr) for the 8-card Cloud AI 100 accelerator payload at 75W per card to ensure sustained performance without thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1586", "title": "Estimating Maximum Throughput on Qualcomm Cloud AI 100", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For an 8 GOPS INT8 ResNet-50 on the 400 TOPS, 75W Cloud AI 100, what are the compute-bound max FPS and inferences per watt?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1587", "title": "TensorRT INT8 Graph Optimization on Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the theoretical maximum throughput (in FPS) for the compiled graph and the energy consumed per frame assuming the system operates at its peak 60W TDP.", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1588", "title": "Hailo-8 Host Memory Bandwidth Calculation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory bandwidth is needed to keep Hailo-8 at 26 TOPS for a 4 GOPs/frame model streaming 15 MB per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1589", "title": "Edge TPU CI/CD Pipeline Quantization Throughput", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical FPS should the CI/CD smoke test show for an INT8 
model requiring 200 GOPS per inference on a 4 TOPS accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1590", "title": "Hailo-8 ONNX Conversion and Graph Break Bandwidth", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With a 13 GOPS Hailo-8 model and a 1 MB out/1 MB back CPU fallback, what are the max FPS and fallback host bandwidth?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1591", "title": "Memory Footprint Estimation for Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the INT8 13B model fit in 32 GB on Cloud AI 100 after reserving 4 GB for KV cache and 2 GB for workspace, and how much remains?", "chain_ids": ["edge-chain-auto-secondary-003-30"], "chain_positions": {"edge-chain-auto-secondary-003-30": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1592", "title": "Edge Telemetry Buffering on NVIDIA Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 10 Hz with 2 KB snapshots, how much telemetry accumulates per day and what percent of 32 GB RAM would a 7-day buffer consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1593", "title": "Watchdog Threshold for Deterministic Execution", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog threshold should be set if the 1,200 GOPs model uses only 10% of Cloud AI 100 peak (400 TOPS) and the timer must be 20% above WCET?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1594", "title": "Transformer Attention Cost on Coral Edge TPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For N=256 and d=512 on a 4 TOPS Coral Edge TPU, how many INT8 ops does QK^T require and what is the minimum latency?", "chain_ids": ["edge-chain-auto-017-12"], "chain_positions": {"edge-chain-auto-017-12": 0}, "chain_tiers": {"edge-chain-auto-017-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1595", "title": "Hailo-8 4-Bit Streaming Bandwidth", "topic": "extreme-quantization", 
"competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What sustained host-to-accelerator bandwidth is required to stream a 1.5B-parameter 4-bit AWQ model once per token at 20 tokens/s?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1596", "title": "Dataflow Graph Splitting Overhead", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming the Conv2D is 52 GOPS, what latency penalty does the 1920x1080x16 INT8 round trip add over a 2 GB/s PCIe link?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 0}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1597", "title": "Cloud AI 100 Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the TTFT and total end-to-end latency for the Cloud AI 100 service with 2 TOP prefill, 50 tokens at 15 ms TPOT, and the given overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1598", "title": "Shadow Deployment Bandwidth on Hailo-8", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 30 FPS shadow mode on Hailo-8, what host memory bandwidth is needed for two models each moving 6 MB input, 4 MB weights, and 2 MB outputs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1599", "title": "Calculate Efficiency for GPU vs DLA on Orin", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the DLA and GPU TOPS/W efficiencies, and what max FPS can the DLA deliver for a 2.5 TOPS/frame model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1600", "title": "Hailo-8 Activation Spilling Bandwidth Calculation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 512x512x64 INT8 activation spilled and restored at 30 FPS on Hailo-8, what sustained host memory bandwidth is required?", "chain_ids": ["edge-chain-auto-secondary-003-19"], "chain_positions": {"edge-chain-auto-secondary-003-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1601", "title": "Edge TPU Randomized Smoothing Throughput Calculation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With randomized smoothing requiring N=10 inferences per frame, what effective FPS and FPS/W can a 4 TOPS, 2W Coral TPU deliver for a 50 GOPS model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1602", "title": "Hailo-8 MobileNetV2 Block Compute and Memory", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the MobileNetV2 block on Hailo-8, what are the total INT8 ops and minimum host transfer bytes when intermediates stay on-chip?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1603", "title": "Agent Orchestration Memory Footprint on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given 32 GB RAM, a 7B INT8 LLM, 2 GB ViT, 1 GB Whisper, 4 GB OS, and 1 MB/token KV cache, what max context window fits?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1604", "title": "Edge TPU Inference Throughput and Energy Calculation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical max FPS and energy per inference in mJ does the 4 TOPS, 2W accelerator achieve for a 2 GOPS INT8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1605", "title": "Data Pruning Impact on Edge Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much training compute time per epoch is saved by using a 10% coreset instead of the full 100,000-image dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1606", "title": "Data Bottleneck on Qualcomm AI 100", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 6 MB frames over 12 GB/s PCIe and a 100 GOPS model on a 400 TOPS accelerator, what is the max FPS and the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1607", "title": "Edge Data Quality Gate Compute Utilization", "topic": 
"data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What percentage of Jetson Orin's 275 TOPS INT8 budget is consumed by a 10 GOPs/frame quality gate on 4 cameras at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1608", "title": "Calculate On-Device PSI for Edge Accelerator", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact PSI for Reference=[0.5,0.3,0.2] and Serving=[0.4,0.4,0.2] using the standard binned formula?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1609", "title": "DMA Overhead vs Compute on Cloud AI 100", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming no pipelining between transfer and compute, what are the DMA transfer time, compute time, and their ratio for the 16 uncompressed 4K frames?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1610", "title": "Edge TPU Encoder-Decoder Throughput Calculation", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum inferences per second can a 4 TOPS NPU achieve when each seq2seq request needs a 15 GOPS encoder plus 20x2 GOPS decoder?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1611", "title": "Calculate Inference Energy on Google Coral Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical energy per inference in millijoules (mJ) and the maximum theoretical frames per second (FPS)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1612", "title": "Runtime Memory for 4-bit Weights on Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact runtime weight memory footprint required when the 10M-parameter 4-bit model is unpacked to INT8 for Coral Edge TPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1613", "title": "On-Device Fairness Audit 
Calculation for Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the exact Demographic Parity Difference and theoretical minimum audit processing time on the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1614", "title": "Hailo-8 Dataflow Streaming Bandwidth and Throughput Calculation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical maximum FPS and minimum host-memory read bandwidth are needed for the 13 GOPS/frame, 1 MB input model on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1615", "title": "Edge TPU Fallback Model Compute Budget", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum fallback-model compute budget in GOPS per frame to sustain 20 FPS after throttling to 1 TOPS?", "chain_ids": ["edge-chain-auto-secondary-011-05"], "chain_positions": {"edge-chain-auto-secondary-011-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1616", "title": "Hailo-8 Graph Compilation and Power Efficiency Calculation", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute FPS/Watt improvement from reducing the graph to 13 GOPS/frame and raising MAC utilization to 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1617", "title": "Distillation Projection Footprint on AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact INT8 memory footprint of the 1024-to-4096 projection layer in bytes (including bias), and what fraction of the 32 GB memory does it consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1618", "title": "Hailo-8 End-to-End Latency Decomposition for Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total end-to-end latency per frame assuming a 26 TOPS peak accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1619", "title": "Weighted Round-Robin Routing for Heterogeneous Edge Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": 
"edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimal integer weighted round-robin weights should each accelerator get, and what total request capacity supports 0.5 TOPS/request?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1620", "title": "Calculate memory bound latency for INT8 inference on Cloud AI 100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum batch-1 single-token generation latency for the 8B INT8 model at 136 GB/s memory bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1621", "title": "Calculate Mmap Cold Start Latency on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute minimum cold-start time to page the 15 GB INT8 model from 2.5 GB/s NVMe into Jetson memory?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1622", "title": "Coral Edge TPU Activation Memory Sizing", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum safe batch size given 8 MB SRAM, 4.8 MB weights, 0.2 MB driver reserve, and 750 KB activations per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1623", "title": "Calculate Precision Throughput Delta on Cloud AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the absolute difference in throughput (FPS) when quantizing the model from FP16 to INT8.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1624", "title": "CI/CD Performance Gating for Qualcomm Cloud AI 100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What IPS threshold should the CI/CD gate use for the 500 GOPS/inference INT8 model at 75% Cloud AI 100 utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1625", "title": "CPU Fallback Bottleneck on Edge TPU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical maximum FPS results when 20% of the 10 GOPS model falls back to a 100 GOPS CPU and execution is 
sequential?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1626", "title": "Jetson Orin LLM Memory Footprint", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory is required for the 15B INT8 weights, and how much remains for KV cache and activations after the 8 GB reserve on a 32 GB system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1627", "title": "Hailo-8 Host Memory Bandwidth Alerting", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What continuous PCIe bandwidth is needed for 60 FPS 1080p RGB input streaming, and what baseline utilization is that of the 500 MB/s limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1628", "title": "Hardware-Aware NAS Compute Constraint Calculation", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the absolute maximum number of operations (in GOps) a candidate architecture can have per frame to meet the 10,000 FPS constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1629", "title": "Heterogeneous Parallel Branch Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the theoretical compute latencies if Branch A and Branch B run sequentially versus concurrently on the DLA and GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1630", "title": "Compute-Bound Latency on Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected execution latency in milliseconds for the 200 GOPs projection layer at 50% of 400 TOPS INT8 peak?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1631", "title": "Calculating Streaming Bandwidth for Pruned Hailo-8 Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What host memory bandwidth is required to stream weights after uniform 50% structured channel pruning at 50 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1632", "title": "Watchdog Timeout Calculation Under Contention", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum allowable base GPU inference time so the 50ms watchdog is never tripped under contention and jitter?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1633", "title": "Calculate Edge TPU Ingestion Throughput for Video Stream", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming the accelerator operates at 4 TOPS (INT8) and has a 2W power envelope, what maximum theoretical FPS can it process, and will USB 3.0 bottleneck the raw 1080p RGB 30 FPS stream?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1634", "title": "Dataflow Latency and Energy on AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the expected execution latency in milliseconds and the energy consumed in millijoules for this single layer?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 0}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1635", "title": "Calculate Sustained FPS Under Enclosure Thermal Limits", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum sustained FPS can the Jetson Orin run indefinitely when limited to 30W and each inference costs 2 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1636", "title": "USB Latency Overhead in Edge TPU Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total latency per frame if the 4 cameras are processed sequentially (batch size 1) versus batched together over USB 2.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1637", "title": "Host Round-Trip Latency in Edge TPU Operator Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "How much USB communication time per inference is saved by fusing the 4 MB fallback activation so it stays on the Edge TPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1638", "title": "Calculate Maximum Context Length on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the model has 32 layers, 32 KV heads, and a head dimension of 128, what is the maximum batch-1 context length (in tokens) assuming an FP16 KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1639", "title": "Shadow Deployment Arbitration on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design an on-device shadow deployment architecture that safely runs both models concurrently without violating constraints?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1640", "title": "On-Device LoRA with Heterogeneous Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect LoRA fine-tuning on Orin to fit the 8B INT8 transformer, 45 GB naive activations, 32 GB RAM, and 60W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1641", "title": "Dataflow Adversarial Defense on Hailo-8 Edge Cameras", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What Hailo-8-friendly adversarial patch defense would maintain 30 FPS without creating host-memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1642", "title": "Dataflow-Aware Dynamic Batching for Multi-Camera Hailo-8 Edge Appliances", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you dynamically batch 4 asynchronous 1080p 30 FPS camera streams on a DRAM-less Hailo-8 while meeting the 33 ms target?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1643", "title": "Heterogeneous CNN Design for Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", 
"track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What CNN architecture would you design to map 4K 60 FPS perception across Orin's GPU and DLA while maximizing INT8 throughput within 60W?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 4}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1644", "title": "On-Premise Agent Orchestration Pipeline", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you serve the router, retriever, and 8B LLM on one Cloud AI 100 to handle 20 RPS within 1 second, and what bottlenecks dominate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1645", "title": "Multi-Model Dataflow Compute and PCIe Bandwidth Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you fuse the detector, depth estimator, and segmenter on Hailo-8, and what host bandwidth and compute utilization would result?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 4}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1646", "title": "On-Premise Coreset Selection Pipeline", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a concurrent coreset selection pipeline that prunes 1,000 FPS video to 10 FPS for cloud retraining without breaking real-time SLAs?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1647", "title": "Quantization-Aware Active Learning Pipeline for Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you curate and annotate data to fix Edge TPU INT8 quantization failures on rare defects, and what mining throughput is required?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1648", "title": "On-Device Distribution Drift Detection for Edge TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": 
"create", "status": "published", "phase": "both", "question": "How would you detect statistical data drift on-device using the Coral Edge TPU without streaming raw images or disrupting INT8 inference?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": {"edge-chain-auto-secondary-009-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1649", "title": "Bypassing PCIe Bottlenecks on Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What memory and data-movement architecture would make 4 HD 30 FPS camera streams real-time on a PCIe Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1650", "title": "Dataflow Optimization for Sequenced Output", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which sequence model architecture and host-device streaming strategy would maximize throughput on the DRAM-less dataflow accelerator?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1651", "title": "Dataflow Optimization for Hailo-8 Stream Processing", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design an object detection and tracking pipeline on Hailo-8 to minimize energy per inference by reducing host memory access?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1652", "title": "Sub-INT8 Quantization on Dataflow Accelerators", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a mixed 4-bit/INT8 Hailo-8 quantization scheme to cut host bandwidth for a larger transformer and recover accuracy?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1653", "title": "On-Device Intersectional Fairness Auditing at the Edge", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a continuous, on-device fairness audit for 16 subgroups 
without exceeding the 60W TDP or disrupting the primary 30 FPS inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1654", "title": "Hybrid On-Device FL using Hailo-8 Dataflow Accelerator", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use the INT8 inference-only accelerator to run local federated training without stalling the host CPU?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 2}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1655", "title": "Heterogeneous Pipeline Design for Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition a massive transformer perception model across the Ampere GPU and DLA, and design kernels to avoid LPDDR5 bottlenecks?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1656", "title": "Heterogeneous Graph Compilation on Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What graph partitioning and operator lowering strategy would you use to hit 120 FPS within the Jetson Orin's 60W TDP?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 4}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1657", "title": "Cross-Architecture Distillation for Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you distill the ViT into an Edge TPU-compatible INT8 CNN that runs fully on the TPU without accuracy loss?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1658", "title": "Edge TPU Cluster Load Balancing for Real-Time Video Analytics", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you route and load-balance 8 camera streams across 4 Coral Edge TPUs while handling bursts and frame drops?", "chain_ids": ["edge-chain-auto-secondary-006-07"], 
"chain_positions": {"edge-chain-auto-secondary-006-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1659", "title": "Dataflow Streaming on Host-Dependent Accelerators", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the Hailo-8 memory hierarchy and streaming strategy for 4 1080p30 streams without stalling PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1660", "title": "Zero-Copy Model Switching via Mmap", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve 15 unpredictable 4 GB models across 8 parallel workers without OOMs or long cold starts?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1661", "title": "Zero-Copy Host Streaming Architecture for Hailo-8 Memory", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign host memory management to stop sporadic OOM kills while preserving zero-copy Hailo-8 streaming?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1662", "title": "Edge TPU QAT and Mixed-Precision Pipeline Design", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a BF16/FP32 QAT pipeline that simulates strict INT8 Edge TPU execution and avoids accuracy loss?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1663", "title": "Hailo-8 Dataflow Compilation with Unsupported Ops", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you convert and deploy the unsupported activation and attention operators without breaking Hailo-8 dataflow efficiency?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1664", "title": "Multi-Tenant LLM Serving on Cloud AI 100", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve the 8B chat model and 14B summarization model concurrently on one Cloud AI 100 while meeting chat SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1665", "title": "LLM Inference Sizing for Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantization and memory-management plan makes a 14B LLM with 2048-token contexts and 16 users feasible on one 32 GB Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-003-30"], "chain_positions": {"edge-chain-auto-secondary-003-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1666", "title": "Autonomous Fleet Telemetry on Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design telemetry for Jetson Orin robots that can buffer 12 hours offline without starving the perception stack or memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1667", "title": "Hardware-Aware NAS for Edge TPU Object Detection", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS framework to discover the optimal network topology for this specific accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1668", "title": "Optimal LLM Operator Scheduling on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule and fuse operators for a 15B INT8 LLM on Cloud AI 100 to overlap attention memory traffic with dense compute?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1669", "title": "Dynamic Thermal Throttling on Jetson Orin", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition workloads and use DVFS P-states to keep the perception system under a 40W sustained power cap?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 4}, "chain_tiers": 
{"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1670", "title": "Edge TPU Pipeline Profiling and Bottleneck Resolution", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you eliminate the Coral Edge TPU CPU fallback and redesign the pipeline to meet the 33ms frame target?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1671", "title": "Mixed-Precision LLM Architecture for Cloud AI 100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What mixed-precision quantization architecture would fit a 35B MoE LLM into 32 GB while using the Cloud AI 100's INT8 engines?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 3}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1672", "title": "On-Premise Guardrail Architecture for High-Throughput Edge Inference", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you co-locate a 14B summarization LLM with PII and toxicity guardrails under a 150ms on-prem latency SLA?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1673", "title": "Architecting a Streaming Pipeline for Dataflow Roofline Optimization", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the Hailo-8 host streaming path, analyze the roofline, and restructure YOLOX for 4-camera 1080p60 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1674", "title": "Architecting ISO-26262 Compliant Vision on Coral Edge TPU", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an architecture that provides deterministic execution, continuous self-testing, and fail-safe handling without violating the 50ms ASIL-B latency?", "chain_ids": ["edge-chain-auto-secondary-008-01"], "chain_positions": {"edge-chain-auto-secondary-008-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-01": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1675", "title": "Multi-modal Sensor Data Ingestion Architecture on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a novel hybrid storage format and tiering strategy to serialize and persist this heterogeneous data without stalling the inference pipeline or exceeding 60W?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1676", "title": "Edge TPU Systolic Dataflow for Depthwise Convolutions", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign dataflow, tiling, and operators so the drone tracker fully uses the TPU without CPU fallbacks?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 3}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1677", "title": "Architecting Thermal Resilient Video Analytics", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 30 HD camera streams across two Cloud AI 100 cards to meet latency SLAs under 45°C thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1678", "title": "Hailo-8 Streaming KV-Cache Architecture for Long-Context Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign attention dataflow for a 1B VLM on Hailo-8 to avoid streaming a 4K-token KV cache every generation step?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 2}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1679", "title": "Unified Memory Multimodal Perceptor", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition Jetson Orin's 32 GB unified memory for a 13B LLM, 2B vision encoder, and 4000-token history without OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1680", "title": "Zero-Copy Multi-Camera Pipeline", "topic": 
"data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you build a zero-copy ISP-to-GPU/DLA pipeline for 8 concurrent 4K30 streams on Jetson Orin without starving inference?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 5}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1681", "title": "CI/CD Pipeline for Fleet-Wide Coral Edge TPU Deployment", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a CI/CD pipeline for 10,000 edge cameras to quantize, verify operator mapping, and safely roll out models?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 3}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1682", "title": "Dual-Bank OTA Architecture for Coral Edge TPU Ensembles", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a zero-downtime A/B OTA scheme for 5,000 Coral devices that atomically updates the app and all three models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1683", "title": "Canary Rollout Context Thrashing on AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the structural bottleneck of this traffic splitting approach on the Cloud AI 100, and how would you quantify a fix?", "chain_ids": ["edge-chain-auto-secondary-011-03"], "chain_positions": {"edge-chain-auto-secondary-011-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1684", "title": "On-Device LoRA Gradient Checkpointing Tradeoffs", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose this bottleneck and quantify the compute-memory tradeoff of applying gradient checkpointing to fit the workload within the hardware constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1685", "title": "Optimizing Adversarial Purification on Jetson Orin", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the hardware bottleneck and quantify the performance gain of a deployment 
optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1686", "title": "Dynamic Batching Latency Optimization on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 80ms tail latency from dynamic batching, and what batching strategy would meet the 33.3ms deadline?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1687", "title": "Optimizing CNNs for Qualcomm Cloud AI 100 Bottlenecks", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you redesign the architecture to alleviate this memory bandwidth bottleneck, and how do you quantify the reduction in memory traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1688", "title": "Edge TPU Multi-Model Pipeline Fallback Optimization", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the Coral Edge TPU OCR bottleneck to ensure the detector-plus-OCR pipeline reaches 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1689", "title": "Optimizing Transformer Inference on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would INT8 quantization and DLA utilization change the compute, memory, and power profile of the 2.5B tracking transformer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1690", "title": "Edge TPU INT8 Calibration Coreset Optimization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically optimize the calibration data selection to diagnose and fix this quantization bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1691", "title": "Optimizing Data Validation Pipelines for Coral Edge TPU", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you 
move data validation and anomaly detection from the host CPU to the Coral Edge TPU and quantify the throughput gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1692", "title": "Optimizing Edge Active Learning Curation", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you reduce Hailo-8 active learning storage and host bandwidth by selecting only semantically novel frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1693", "title": "Optimizing Drift Detection on Hailo-8 Dataflow Accelerator", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose this bottleneck and optimize the drift detection strategy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1694", "title": "Hailo-8 Zero-Copy Stream Optimization", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What host-device data movement bottleneck stalls the pipeline, and how would zero-copy DMA fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1695", "title": "Optimizing Encoder-Decoder Latency on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the compute and memory bottlenecks for the decoder, and what optimizations would you apply to resolve this bottleneck?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1696", "title": "Optimizing Intersectional Fairness Evaluation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this bottleneck and optimize the pipeline to ensure fairness metrics are calculated efficiently without exceeding the power budget or memory limits?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1697", "title": "Optimizing INT8 Compute Utilization on Qualcomm AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": 
"both", "question": "How would you quantify the memory bottleneck behind 5% Cloud AI 100 utilization and improve it with fusion, coalescing, and batching?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1698", "title": "Thermal Throttling Degradation on Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What degradation ladder would you use after a 40% GPU clock throttle to keep the perception pipeline at a safe 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1699", "title": "Operator Fusion Bottlenecks on Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this compiler bottleneck and quantify the impact of operator lowering and fusion?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1700", "title": "Feature Distillation I/O Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck caused by the 16MB intermediate activation on Hailo-8, and what distillation architecture change would remove it?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1701", "title": "Google Coral USB Latency Pipeline Optimization", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the hidden latency in the 85ms Coral Edge TPU pipeline, and how would you optimize it to meet the 33.3ms frame budget?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1702", "title": "Stream Routing and PCIe Bottleneck Analysis", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin routing stall the 16-camera system, 
and how much would stream-affinity routing improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1703", "title": "Optimizing Cold Start with Memory-Mapped Inference on Coral TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would memory-mapped loading reduce cold-start latency when multiple processes load the same 20MB Coral Edge TPU model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1704", "title": "Resolving Concurrent OOM on Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the memory bottleneck and quantify a structural fix to prevent OOM without exceeding the 60W TDP limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1705", "title": "Dataflow Pipeline Bottlenecks in Mixed-Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the system-level cost of host FP16 fallback on Hailo-8, and how would you keep the whole model on the INT8 dataflow path?", "chain_ids": ["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1706", "title": "TensorRT DLA to GPU Fallback Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the operator coverage gap causing this fallback and quantify the performance cost?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1707", "title": "Edge TPU Model Quantization and Footprint Optimization", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose Edge TPU CPU fallbacks in the MobileNetV2 graph and verify the fully INT8 model fits and runs efficiently?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1708", "title": "Optimizing Telemetry Overhead on 
Qualcomm Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze this observability overhead and design an optimized telemetry strategy without losing visibility into stragglers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1709", "title": "Operator Scheduling and Fusion on Edge TPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling bottleneck is caused by the unsupported custom operator, and how much latency is saved by making the graph fully TPU-compatible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1710", "title": "Optimizing Hailo-8 OTA Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you prevent a 50MB OTA update over 5 Mbps from starving Hailo-8 inference on the shared host memory bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1711", "title": "Hailo-8 Host Streaming Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix the host-side bottleneck keeping the 1080p60 pipeline at 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1712", "title": "Optimizing Privacy Guardrails on Edge TPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the Coral privacy model's 80ms latency spike, and how would you refactor it to be fully TPU-compatible?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1713", "title": "Storage I/O Optimization for Qualcomm AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What data-loading bottleneck is leaving the Cloud AI 100 at 10% utilization, and what storage format would keep it fed?", "chain_ids": ["edge-chain-auto-secondary-008-34"], "chain_positions": {"edge-chain-auto-secondary-008-34": 0}, "chain_tiers": 
{"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1714", "title": "Optimizing High-Res Camera Ingestion on Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What ingestion bottleneck is dropping six 4K camera streams on Jetson Orin, and how would zero-copy NVMM buffers reduce it?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1715", "title": "Hailo-8 Spatial Tiling and Host Bandwidth", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the host memory bottleneck and quantify the impact of a depth-first dataflow tiling strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1716", "title": "Mitigating Edge TPU Thermal Throttling in Sealed Enclosures", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal bottleneck is causing the device to drop from 30 to 15 FPS, and how would you sustain at least 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1717", "title": "Optimizing KV Cache for Long-Context on Orin", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Diagnose the architectural bottleneck causing this degradation and quantify the optimization impact of using Grouped-Query Attention (GQA) and INT8 KV-cache quantization.", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 1}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1718", "title": "Google Coral Edge TPU Subgraph Optimization", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the Coral Edge TPU model reaching only 10 FPS after quantization, and how would you eliminate CPU fallbacks to hit 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1719", "title": "Mitigating Bandwidth Bottlenecks for 4-bit AWQ on Jetson Orin", "topic": 
"extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 4-bit AWQ 7B model slow during decoding on Jetson Orin, and what fused-kernel fix would remove the bottleneck?", "chain_ids": ["edge-chain-auto-secondary-006-01"], "chain_positions": {"edge-chain-auto-secondary-006-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1720", "title": "Mitigating Federated Communication Bottlenecks on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is stalling federated averaging on Jetson Orin, and how would INT8 plus top-k update compression reduce LTE time and power?", "chain_ids": ["edge-chain-auto-017-01"], "chain_positions": {"edge-chain-auto-017-01": 1}, "chain_tiers": {"edge-chain-auto-017-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1721", "title": "Optimizing ViT Activation Memory Bandwidth on Jetson Orin", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the exact memory hierarchy constraint and quantify the impact of operator fusion to hit the framerate target within the 60W TDP?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 3}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1722", "title": "Optimizing CI/CD Deployment Pipelines for Hailo-8 Dataflow Architectures", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What deployment bottleneck can drop Hailo-8 throughput despite unchanged INT8 ops, and how should CI/CD configure host streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1723", "title": "Edge TPU Multi-Model Cache Thrashing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the Coral cascade's 80% idle time and fix SRAM weight thrashing between the 5.5MB and 4.5MB models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1724", "title": "Canary Rollout of INT8 Model on Coral Edge TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you structure the rollout, and what specific telemetry do you monitor to decide 
whether to advance or rollback the deployment?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1725", "title": "Dataflow Accelerator Viability for Multi-Camera Edge Pipeline", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the dataflow accelerator sustain four 1080p30 streams at 50 GOPS per frame, and what system-level constraints drive the decision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1726", "title": "Edge TPU Activation Memory Constraint Evaluation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Given the Coral's strict requirement for INT8 quantization and limited on-chip SRAM (~8MB), how would you choose between downsampling, CPU/TPU splitting, and receptive-field changes when Edge TPU activations exceed SRAM?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1727", "title": "Adversarial Input Purification on Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is an INT8 input purification network feasible on the Cloud AI 100, and what compute and memory overhead would it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1728", "title": "Sizing Dynamic Batching for LLM Prefill on Edge", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is the Cloud AI 100 compute-bound or memory-bound during prefill, and what dynamic-batching limit meets the 200ms TTFT SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1729", "title": "Sizing MobileNetV2 for Google Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose the MobileNetV2 input resolution and depth multiplier to achieve 30 FPS within Coral Edge TPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1730", "title": "Multi-Model Pipelining on Hailo-8 Dataflow Architecture", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": 
"realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the detector and OCR networks be swapped sequentially or co-compiled on Hailo-8 to meet 30 FPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1731", "title": "Scaling Video Analytics on Qualcomm Cloud AI 100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many accelerators are needed for 500 streams at 150 GOPS and 30 FPS, and what is the peak power draw?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1732", "title": "Data Pruning for Hailo-8 Streaming Limits", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should Hailo-8 streaming constraints shape the data pruning strategy, and what is the max parameter count at 4 GB/s and 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1733", "title": "Optimizing 4K Image Pipelines for Cloud AI 100", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you partition decoding, resizing, and normalization between the host and accelerator to keep 4K 120 FPS inference from starving?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 3}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1734", "title": "Streaming Data Validation for Hailo-8 Edge Inference", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the architectural tradeoffs between performing lightweight data quality checks on the host CPU versus running a small validation model on the Hailo-8 for 4x 1080p 30 FPS streams?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 3}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1735", "title": "On-Device Active Learning Data Selection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you size and place the uncertainty-sampling pipeline on Jetson Orin so it fits memory, uses the DLA, and respects the 5 Mbps uplink?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 2}, 
"chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1736", "title": "Resource-Constrained Drift Detection on Jetson Orin", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you implement real-time data drift detection on Jetson Orin without disrupting the main GPU or DLA inference within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1737", "title": "Zero-Copy Pipeline Design on Unified Edge Architectures", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the four-4K-camera Jetson Orin pipeline to use zero-copy buffers and eliminate CPU, GPU, and DLA transfer overhead?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1738", "title": "Encoder-Decoder Sizing on Qualcomm AI 100", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which translation model would you deploy on Cloud AI 100, the 7B decoder-only or 3B encoder-decoder, and how would you batch it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1739", "title": "Energy-Aware Operator Selection on Cloud AI 100", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the execution strategy to maximize INT8 throughput under 75W while minimizing LPDDR4x access energy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1740", "title": "Sizing a 70B LLM for Qualcomm Cloud AI 100", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What extreme quantization scheme would fit a 70B LLM and KV cache into 32 GB on the Cloud AI 100, and what are the runtime tradeoffs?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1741", "title": "Quantized Edge TPU Fairness Evaluation Strategy", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": 
"both", "question": "How would you evaluate intersectional fairness for the INT8 face detector under the 2W deployment constraints?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1742", "title": "Evaluating GPU to Edge TPU Migration", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you migrate the 20 GOPS, 50 FPS detector from an FP16 GPU to the Coral Edge TPU and verify it fits the 4 TOPS INT8 limit?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1743", "title": "Dual-Model QoS Shedding on Cloud AI 100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a graceful degradation ladder on a 32 GB accelerator to preserve premium quality while serving free-tier fallback traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1744", "title": "Fused Attention Optimization on Qualcomm Cloud AI 100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What fused attention strategy would you use on Cloud AI 100 to remove LPDDR4x round-trips, and how much bandwidth would it save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1745", "title": "Distilling a Hybrid Object Detector for Jetson Orin", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design feature distillation and size the student CNN-Transformer to hit 30 FPS while using the Orin's Ampere GPU and DLA efficiently?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1746", "title": "Heterogeneous Routing on NVIDIA Jetson Orin", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 24 HD video streams across Jetson Orin's GPU and DLAs to maximize throughput while staying within 60W?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1747", "title": "Hailo-8 Multi-Process Memory-Mapped Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you share the 120 MB Hailo-8 backbone across three containers while staying under a 200 MB host memory budget?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1748", "title": "Sizing Paged Memory for Multi-Tenant LLM Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design continuous batching for a 13B INT8 model on 32 GB Cloud AI 100 to avoid KV-cache fragmentation and OOMs?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1749", "title": "Mixed-Precision Perception on Jetson Orin", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a mixed-precision Jetson Orin deployment for the 5B multimodal model to meet 4-camera 30 FPS under 60W?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": {"edge-chain-auto-secondary-011-30": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1750", "title": "Hardware-Aware Shadow Deployment on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is shadow-deploying the INT8 candidate on Jetson Orin feasible, and how would you partition production and candidate workloads?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 2}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1751", "title": "Optimizing ViT Operator Fallback on Cloud AI 100", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you rewrite the ViT to use Cloud AI 100 supported operators or write custom kernels for unsupported attention ops, and why?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1752", "title": "Multi-Model Serving Strategy on Hailo-8 Accelerator", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Hailo-8 detector and feature extractor be temporally multiplexed or co-compiled into one spatial context, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1753", "title": "Sizing Object Detection for Hailo-8 Dataflow Limits", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can one Hailo-8 run the 25M-parameter, 50 GOps model for four 1080p 30 FPS feeds, and what is the primary bottleneck?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 3}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1754", "title": "Sizing Telemetry for Coral Edge TPU Fleet", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What telemetry architecture, metrics, cadence, and central ingestion sizing would you use for 5,000 nodes over 50 Kbps links?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1755", "title": "Hailo-8 Dataflow Scheduling for High-Resolution Detection", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule and fuse the YOLOv8 layers to maintain 30 FPS without making PCIe bandwidth the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1756", "title": "Camera Stream Sizing for Cloud AI 100", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many 30 FPS streams can the Cloud AI 100 support for a 2.5 TOPS-per-frame model under the 60W chassis power cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1757", "title": "Privacy Guardrail Pipeline on Dataflow Edge", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design this dual-model pipeline and size the required host-accelerator memory bandwidth given the 
hardware constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1758", "title": "Edge TPU Storage Format and I/O Throughput Sizing", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What storage format should the Coral pipeline use for hard negatives on eMMC, and how would you size I/O to keep the Edge TPU fed?", "chain_ids": ["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1759", "title": "Multi-Camera Streaming Edge Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the system handle 20 uncompressed 1080p60 camera streams for a 15 GOPS/frame feature extractor, and where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1760", "title": "Optimizing ViT Attention Dataflow on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should QxK^T use weight-stationary or output-stationary dataflow, and how would you tile it to minimize memory access on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1761", "title": "Edge TPU Activation and Weight Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you size the CNN weights, activations, and intermediate tensors so the Coral Edge TPU keeps the detector in one INT8 subgraph?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1762", "title": "Optimizing Object Detection Graph for Coral Edge TPU", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the tradeoff between modifying the graph to use natively supported INT8 operators versus pipelining the host-side execution. 
Should you modify the SSD graph or pipeline the fallback?", "chain_ids": ["edge-chain-auto-secondary-006-28"], "chain_positions": {"edge-chain-auto-secondary-006-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1763", "title": "Hailo-8 End-to-End Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you break down the 15 ms latency across preprocessing, PCIe transfers, accelerator compute, and postprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1764", "title": "Hardware-Aware NAS for Jetson Orin DLA", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you constrain the NAS search space and cost model so the Orin DLA-only perception model is compatible and meets latency targets?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1765", "title": "Pruning Tradeoffs for Dataflow Edge Accelerators", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 4-camera Hailo-8 detector, would you use unstructured pruning or structured channel pruning to fit PCIe and 2.5W limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1766", "title": "Hailo-8 INT8 Quantization Strategy for Dataflow Streaming", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the system-level impact of choosing per-channel versus per-tensor INT8 quantization on the streaming architecture's throughput.", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 3}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1767", "title": "Roofline Optimization on Qualcomm Cloud AI 100", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using roofline analysis on Cloud AI 100, is the 150 GOPS ViT memory-bound or compute-bound, and should you prioritize pruning or batching?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 4}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1768", "title": "Sizing Llama-3 8B KV Cache for AI 100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you calculate the maximum batch size and context length for an 8B INT8 LLM on 32 GB Cloud AI 100 and improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1769", "title": "Shadow Deployment Quantization Requirement on Coral TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To successfully compile the new candidate model and deploy it to receive duplicated live traffic, what specific numerical precision format must the model be fully converted to?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1770", "title": "Google Coral Edge TPU Activation Data Type Requirement", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What data format must all activations and weights use before compiling the FP32-trained model for the 2W Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1771", "title": "Identifying Power Side-Channel Attacks on Edge Accelerators", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a power side-channel attack, and what physical property of the Cloud AI 100 would an attacker monitor?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1772", "title": "Hailo-8 Host-Side Data Stream Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "In this architecture, where must data quality checks and schema validation occur, and what happens if corrupted data is streamed to the device?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1773", "title": "Recall KL Divergence for Edge 
Drift", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the mathematical definition of Kullback-Leibler (KL) divergence, and is it a symmetric metric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1774", "title": "Define Fail-Operational vs Fail-Safe on Cloud AI 100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the exact difference between fail-safe and fail-operational states in this Cloud AI 100 edge deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1775", "title": "Jetson Orin DLA Offloading for Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific name of the dedicated fixed-function hardware accelerator on the Jetson Orin designed for this type of inference?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1776", "title": "Identifying Jetson Orin Accelerator Routing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific NVIDIA software configuration feature allows you to explicitly route incoming inference requests to either the GPU or the DLA for load balancing?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1777", "title": "Hailo-8 Memory Architecture and Power Efficiency", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental memory architecture of the Hailo-8 dataflow accelerator, and what is its rated INT8 performance and power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1778", "title": "Jetson Orin Peak INT8 Performance Recall", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "To set expectations for the inference latency of your quantized model, what is the theoretical peak INT8 compute performance of the NVIDIA Jetson Orin, and what hardware components contribute to this?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": 
{"edge-chain-auto-secondary-011-30": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1779", "title": "Qualcomm Cloud AI 100 Compute and Memory Recall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum INT8 compute throughput and total memory capacity of this specific accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1780", "title": "Hailo-8 Local Memory Architecture Recall", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the capacity of the onboard local DRAM on a Hailo-8 chip available for storing model weights and activations?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1781", "title": "Recall NVIDIA Jetson Orin DLA Purpose", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does DLA stand for, and what type of operations is it primarily optimized for in this architecture?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1782", "title": "Hailo-8 Dataflow Memory Constraints for Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific hardware memory characteristic of the dataflow accelerator dictates this spatial operator scheduling approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1783", "title": "Jetson Orin DLA Identification for Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific, separate hardware accelerator on the Jetson Orin is likely processing the INT8 inference workload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1784", "title": "Sparsity Support on Google Coral Edge TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Does 75% unstructured weight pruning reduce 
inference latency on the Edge TPU, and what pruning approach would actually help?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 0}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1785", "title": "Hailo-8 Hardware Specs for Model Cards", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the peak compute throughput, required precision, and power consumption of the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1786", "title": "Edge Inference Logging Storage Format", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What binary, row-oriented storage format native to the TensorFlow ecosystem is designed for efficiently logging sequential, append-only inference results to local storage?", "chain_ids": ["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1787", "title": "Qualcomm Cloud AI 100 Memory Capacity Recall", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the exact capacity and memory type of the Qualcomm Cloud AI 100's on-board memory for sizing ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1788", "title": "NVIDIA DLA Primary Compute Architecture on Jetson Orin", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific type of hardware architecture does the DLA use at its core to compute dense matrix multiplications?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 0}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1789", "title": "Hailo-8 Power and Performance Specification Recall", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the typical power consumption and peak INT8 performance of the Hailo-8 under load?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1790", "title": "Google Coral Edge TPU 
Precision Requirement", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What quantization requirement must the model meet to run on the Coral Edge TPU instead of falling back to the CPU?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1791", "title": "Qualcomm Cloud AI 100 Physical Memory Capacity Recall", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total memory capacity and specific memory technology used on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1792", "title": "Qualcomm Cloud AI 100 Toolchain and Specs Recall", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of the primary software SDK and compiler toolchain provided by Qualcomm used to convert and optimize standard ONNX models for this specific hardware?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1793", "title": "Google Coral Edge TPU Supported Data Type", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific data type must all model weights and activations be quantized to for successful execution on the Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1794", "title": "Shadow Deployment Design on Qualcomm Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you shadow the new model on live 60 FPS streams while preserving primary latency and fitting within edge resource constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1795", "title": "On-Premise LLM Fine-Tuning Memory Specification", "topic": "activation-memory", "competency_area": "memory", "track": 
"edge", "level": "L1", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What activation memory strategy would let the 7B fine-tuning run within 32 GB by trading Cloud AI 100 compute for memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1796", "title": "Adversarial Patch Defense Spec on Jetson Orin", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an adversarial-patch defense for 4K 30 FPS on Jetson Orin while sharing the GPU, DLA, memory, and 275 TOPS budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1797", "title": "Continuous Batching for Multi-Camera Object Detection", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you batch 8 asynchronous 30 FPS camera streams to maximize throughput while keeping per-frame latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1798", "title": "Multi-Stream Video Analytics on Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you modify EfficientNet-lite and batching on the Cloud AI 100 to maximize 30 FPS stream density without becoming memory-bandwidth bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1799", "title": "Dual-Model Security Pipeline on Google Coral Edge TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you orchestrate the detector, embedding model, INT8 quantization, and host-device routing to meet 15 FPS on a 4 TOPS INT8, 2W accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1800", "title": "Jetson Orin Fleet Compute Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can one Jetson Orin support two 150 TOPS video streams, and what is the daily electricity cost for 1,000 devices at 60W and $0.15/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1801", "title": "Edge TPU Visual Inspection Data Quality Pipeline", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an edge-only data validation gate for noisy 
Coral Edge TPU camera frames within the 2W INT8 constraints?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1802", "title": "Data Curation for Hailo-8 INT8 Quantization", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate an INT8 calibration dataset for Hailo-8 so activation outliers are captured and quantization accuracy is preserved?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1803", "title": "Hailo-8 Traffic Camera Drift Detection Specification", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect seasonal data drift at the edge on Hailo-8 without adding host memory bandwidth pressure or disrupting inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1804", "title": "Hailo-8 Multi-Stream Zero-Copy DMA Pipeline Design", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build the host-to-Hailo-8 memory pipeline for four 1080p30 streams to avoid frame drops and minimize CPU copies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1805", "title": "Energy-Aware Inference Architecture on Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should the data flow and operator selection be designed to stay within the 60W power budget while maximizing compute utilization?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": {"edge-chain-auto-secondary-005-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1806", "title": "On-Premise Intersectional Fairness Specification for Diagnostic Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify the architectural requirements to track and compute these fairness metrics across various demographic subgroups while co-locating with the primary diagnostic workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1807", "title": "Inference Compute Specification for Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", 
"zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you balance batch size and operational intensity to maximize the INT8 compute utilization without hitting the LPDDR4x memory bandwidth wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1808", "title": "Degradation Ladder for Autonomous Edge Perception", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a fail-operational degradation ladder for Jetson Orin when thermal throttling caps power at 15W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1809", "title": "AOT Compilation Strategy for Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design AOT graph compilation and partitioning so unsupported ops do not cripple Cloud AI 100 INT8 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1810", "title": "Knowledge Distillation for Hailo-8 Dataflow", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the student model and distillation process given its INT8 dataflow execution and no local DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1811", "title": "Hailo-8 Multi-Camera Stream Load Balancing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route 8 camera streams across 4 Hailo-8 accelerators while minimizing host memory contention and avoiding accelerator starvation?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1812", "title": "Multi-Process Inference Optimization on Google Coral Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you load one MobileNetV2 Edge TPU model across four isolated processes while minimizing RAM use and cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1813", "title": "Design Memory Management for Multi-Model Pipeline on Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a memory management specification to handle peak pressure, prevent OOMs, and minimize fragmentation for concurrent GPU/DLA inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1814", "title": "INT8 Dataflow Specification for Hailo-8", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What precision-reduction and deployment pipeline efficiently runs the FP32 detector on this architecture without host-memory streaming bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1815", "title": "Jetson Orin Multi-Model TensorRT Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert and delegate the detector and segmentation models on Jetson Orin to maximize INT8 performance under 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1816", "title": "Vision Model Memory Profiling for Google Coral Edge TPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What end-to-end quantization, compilation, and memory analysis pipeline would you specify for deploying MobileNetV2-SSD on Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1817", "title": "Dataflow NAS Constraints for Hailo-8 Accelerator", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the NAS cost model and search space account for Hailo-8's no-DRAM dataflow architecture and host-memory bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1818", "title": "Edge TPU Layer Fusion and Operator Scheduling for INT8 Vision Model", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule and fuse operators on the Coral Edge TPU to keep intermediate tensors in SRAM and avoid USB CPU fallbacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1819", "title": "Hailo-8 Stream Profiling and Latency Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What comprehensive profiling 
architecture would you specify to determine whether Hailo-8 latency comes from preprocessing, PCIe bandwidth, or dataflow stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1820", "title": "INT8 Quantization Strategy for Google Coral Deployment", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fully quantize and refactor the MobileNetV2 model so it runs entirely on the Coral Edge TPU without CPU fallbacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1821", "title": "On-Device Privacy Guardrails for Intersection Monitoring", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you integrate on-device face blurring and bias-audit logging while keeping the Coral Edge TPU pipeline fully INT8-compatible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1822", "title": "Fail-Safe Industrial Anomaly Detection on Edge TPU", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design deterministic Coral Edge TPU inference with watchdogs and safety guarantees despite limited operator support?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1823", "title": "Factory Defect Detection Storage Pipeline", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you format, compress, store, and stream 100 8MB images per second so the accelerator is not starved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1824", "title": "Real-Time Sensor Ingestion Pipeline for Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the camera and LiDAR ingestion pipeline on Jetson Orin to meet a 30ms latency budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1825", "title": "Design a Dataflow Accelerator Pipeline for ResNet-50 on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which dataflow strategy would you use for ResNet-50 layers on Hailo-8 at 120 FPS, and how would you buffer weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1826", "title": "Thermal Specification for 
Edge TPU Burst Workloads", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal management specification would prevent Coral Edge TPU throttling during 10-second 2W bursts at 45°C ambient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1827", "title": "Hailo-8 Transformer KV-Cache Streaming Architecture Specification", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition a 125M-parameter transformer on Hailo-8 given O(n²) attention and host-streamed KV-cache bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1828", "title": "Hybrid Compute Partitioning for Coral Edge TPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition the CNN detector and dynamic tracker between the Coral Edge TPU and host CPU under a 5W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1829", "title": "Data Pruning for Edge TPU Object Detection", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select a 500k-image coreset from 5 million images to maximize accuracy after INT8 deployment on the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1830", "title": "Designing Kernel Fusion for ViTs on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What kernel fusion strategy would you use for a ViT on Jetson Orin to reduce LayerNorm, GELU, and launch overhead within 60W?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 1}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1831", "title": "Latency Budgeting for Defect Detection on Edge TPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you allocate the 25ms camera-to-reject-arm latency budget for a Coral Edge TPU defect detection pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1832", "title": "Qualcomm Cloud AI 100 Multi-Model Serving Architecture", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": 
"edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design model serving for four ResNet-50 variants and 16 camera streams on one Qualcomm Cloud AI 100 to maximize utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1833", "title": "Observability for On-Premise AI 100 Accelerators", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you design the observability stack to ensure high reliability without overwhelming the factory's limited external network bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1834", "title": "Roofline Analysis for MobileNetV2 on Qualcomm Cloud AI 100", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use roofline analysis to diagnose if MobileNetV2 is compute-bound or memory-bound on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 3}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1835", "title": "Optimizing INT8 Matrix Multiplication on NVIDIA Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize the underperforming INT8 GEMM on an edge accelerator for Tensor Core utilization, coalescing, and occupancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1836", "title": "Diagnosing Inefficient Neural Network Inference on Hailo-8", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the performance bottleneck, considering the Hailo-8's unique architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1837", "title": "Optimizing INT8 Matrix Multiplication on Qualcomm Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze and optimize the INT8 GEMM kernel on Cloud AI 100 for memory coalescing, occupancy, and INT8 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1838", "title": "Optimizing Transformer Inference on Qualcomm Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "How would you optimize the transformer attention GEMMs on Cloud AI 100 using execution model tuning, coalescing, and INT8 Tensor Cores?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1839", "title": "Evaluating Qualcomm Cloud AI 100 for Edge ML Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the Cloud AI 100 with a high-end embedded GPU (like Jetson Orin) and a low-power, general-purpose CPU (like Intel Atom) for smart-camera inference across efficiency, programmability, and TCO?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1840", "title": "Edge AI Deployment: Selecting the Optimal Accelerator for Real-time Object Detection with Hailo-8", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the Hailo-8 against a low-power ARM CPU with integrated NPU and a small embedded GPU, and what trade-offs justify your choice?", "chain_ids": ["edge-chain-auto-secondary-008-13"], "chain_positions": {"edge-chain-auto-secondary-008-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1841", "title": "Jetson Orin: Edge Accelerator Trade-offs for Object Detection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you justify your choice, emphasizing the trade-offs across programmability, inference throughput, power budget, and memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1842", "title": "Optimizing Edge ML Inference on Google Coral Edge TPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks likely cause Coral Edge TPU latency over 50ms, and what quantifiable fixes would meet the power and performance limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1843", "title": "Quantized Conv Layer Performance on Google Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Assuming a perfectly efficient 
weight-stationary systolic array execution, neglecting data movement overheads and only considering MAC operations, what is the theoretical minimum time in milliseconds for this layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1844", "title": "Systolic Array Dataflow Optimization for Qualcomm Cloud AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What could be the root cause of this underutilization and increased latency, specifically considering the characteristics of transformer layers and the accelerator's architecture?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 1}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1845", "title": "Dataflow and On-chip Memory Utilization", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should on-chip SRAM and weight-stationary versus output-stationary dataflows minimize latency for the 1x1 convolution?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1846", "title": "Edge TPU Object Detection Cost Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the annual electricity cost for 100 Coral Edge TPU factory deployments running 24/7 with a 5W host and 75% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1847", "title": "Edge AI Perception Model Costing on Qualcomm AI 100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate annual INT8 operations, accelerator-hours, and total dollar cost for 100,000 equipped vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1848", "title": "Edge TPU Inference Cost and Scalability", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many Coral Edge TPUs are required per 30 FPS camera, and what power and operational cost implications follow?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 2}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1849", "title": "Edge LLM Inference Cost on Qualcomm Cloud AI 100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "What maximum requests per second and cost per million inferences can one accelerator with 400 TOPS INT8 deliver at 80% of peak on this LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1850", "title": "Edge Model Evaluation: Performance and Cost on NVIDIA Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FPS and annual electricity cost do Models Alpha and Beta achieve on Jetson Orin, and which would you recommend?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 3}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1851", "title": "Edge AI Costing: Large-Scale Vision Deployment on Hailo-8", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the per-device FPS, fleet energy use, 5-year energy cost, and hidden deployment costs for 10,000 Hailo-8 devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1852", "title": "Edge AI Compute Cost and Performance Estimation for Google Coral TPU Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the maximum theoretical inferences per second (FPS) per device, and what is the annual electricity cost for the 1000-device fleet?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 1}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1853", "title": "Jetson Orin LLM Inference VRAM Budgeting Components", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory beyond the 7B BF16 weights must fit in Jetson Orin's 32 GB LPDDR5, and what formulas estimate each footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1854", "title": "VRAM Budgeting for a Quantized Vision Transformer on Hailo-8", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can this 5M-parameter INT8 vision transformer fit in the 8 MB SRAM, and how would you budget and optimize weights, activations, and KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1855", "title": "VRAM Budgeting for LLM Inference on Qualcomm Cloud AI 100", "topic": "vram-budgeting", 
"competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture fits the Cloud AI 100's 32 GB VRAM for 2048-token inference, and how would AdamW fine-tuning change the budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1856", "title": "Hailo-8 VRAM Budgeting for Edge LLM Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you budget and optimize weights, activations, and KV-cache on Hailo-8 to stop OOMs and meet sub-100 ms conversational inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1857", "title": "Memory Hierarchy Tradeoffs for LLM Deployment on Qualcomm Cloud AI 100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do LPDDR4x capacity, bandwidth, latency, and the 75W power limit constrain real INT8 throughput on Cloud AI 100 for this large LLM?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 1}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1858", "title": "Optimizing Large Language Model Deployment on NVIDIA Jetson Orin", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage the 20 GB INT8 model and activations on Jetson Orin to meet sub-100 ms per-token latency within 32 GB LPDDR5 and 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1859", "title": "Hailo-8 Memory Bandwidth for Convolution Layer", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For the 3x3 convolution, how many MACs are required, what input-activation bandwidth is needed in 1 ms with 4x reuse, and is it memory-bound versus a 68 GB/s bandwidth limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1861", "title": "Optimizing Large Language Model Deployment on NVIDIA Jetson Orin with Activation Memory Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What activation recomputation and quantization strategy would let the 10B FP16 LLM run on Jetson Orin within 32 GB LPDDR5 and 60W?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1862", "title": "Coral Edge TPU: Activation Memory & Compute-Memory Tradeoff for Real-time Segmentation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which model do you recommend and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1863", "title": "Jetson Orin LLM Deployment: Activation Memory Bottleneck & Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose Jetson Orin OOMs from forward activations in the 7B INT8 transformer and reduce peak memory without killing throughput?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1864", "title": "Hailo-8 Memory-Mapped Weight Loading Strategies for Shared Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should mmap be used so multiple Hailo-8 inference processes share model weights and avoid cold-start latency?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1865", "title": "Memory-Mapped Inference on Google Coral Edge TPU: Cold Start Analysis", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes Coral cold-start latency with memory-mapped weights, and what strategy would reduce page faults and sustained storage stalls?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1866", "title": "Optimizing Memory-Mapped Model Loading on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you mmap the 20 GB LLM weights to minimize cold start while preserving enough 32 GB LPDDR5 for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1867", "title": "Optimizing Memory-Mapped Inference on Hailo-8 for Edge AI", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use mmap for 500 MB model weights to achieve fast boot or model swaps, avoid page-fault pitfalls, and share weights across 
processes?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 3}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1868", "title": "Optimizing Memory-Mapped Inference on Google Coral Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and reduce the several-hundred-ms cold start for a 50 MB memory-mapped Coral TFLite model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1869", "title": "Optimizing Memory-Mapped Large Model Inference on Qualcomm Cloud AI 100", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design mmap-based inference for a 70B INT8 model that exceeds Cloud AI 100's 32 GB, while minimizing cold start and sharing weights?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1870", "title": "Optimizing Data Movement on NVIDIA Jetson Orin", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 1080p frame transfers slow from pageable camera buffers on Jetson Orin, and which pinned or zero-copy techniques would fix them?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1871", "title": "Optimize Data Movement for Edge TPU Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize host-to-Edge TPU data movement for the 512x512x3 INT8 detector when transfers consume 30% of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1872", "title": "Qualcomm Cloud AI 100 Data Movement Bottleneck Diagnosis", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose host-device data movement overhead on Cloud AI 100, and what symptoms would indicate inefficient LPDDR4x or DMA use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1873", "title": "Real-time 4K Video Object Detection Data Movement 
Strategy", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What data movement strategy would you use on Jetson Orin to process 4K30 with CPU pre/postprocessing and INT8 GPU inference within 60W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1874", "title": "Edge Inference Data Movement: Coral vs. CPU/GPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do the integrated-GPU SoC and attached Edge TPU differ in DMA, zero-copy, and host-device data movement for high-resolution video inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1875", "title": "Qualcomm Cloud AI 100: Optimizing GenAI Data Movement", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design host-to-Cloud AI 100 tensor transfers for large generative inference to maximize throughput and minimize energy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1876", "title": "Optimizing Data Movement for Edge AI on Jetson Orin", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and optimize the 15ms Jetson Orin data-movement bottleneck using pinned memory, async copies, or zero-copy mapping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1877", "title": "Designing for Memory Pressure on Qualcomm Cloud AI 100", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage memory pressure for a 28 GB LLM on a 32 GB accelerator to prevent OOMs under real-time peak load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1878", "title": "Memory-Efficient LLM Fine-tuning on NVIDIA Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you avoid OOM during Jetson Orin fine-tuning, what is the max micro-batch size, and how does gradient accumulation set effective batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1879", "title": "LLM Deployment with Memory Constraints on Qualcomm Cloud AI 100", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can a 15B INT8 LLM still OOM on a 32 GB device, 
and what memory-pressure strategies and OS effects would you consider?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1880", "title": "Memory Optimization for Large Language Model Deployment on NVIDIA Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which Jetson Orin deployment architecture is more stable under memory pressure, and how would you mitigate OOMs, fragmentation, and future fine-tuning needs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1881", "title": "Hailo-8 Memory Optimization for Real-time Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize memory and handle OOMs for an INT8 LLM on Hailo-8 when activations exceed on-chip memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1882", "title": "Edge TPU Memory Optimization for OOM Prevention", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix sporadic Coral Edge TPU OOMs from peak activations or fragmentation while maintaining 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1883", "title": "Optimizing Object Detection Latency on Google Coral Edge TPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you break down the 150 ms Coral Edge TPU frame latency to find whether capture, preprocessing, USB transfer, inference, or postprocessing dominates?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1884", "title": "Decomposing End-to-End Latency on Qualcomm Cloud AI 100 for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose the 75ms end-to-end latency to identify the bottlenecks preventing you from meeting the 50ms target?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1885", "title": "Latency Decomposition for Real-time Inference on NVIDIA Jetson Orin", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose the 100 ms object-detection latency budget into acquisition, preprocessing, inference, postprocessing, and output?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1886", "title": "Edge TPU Latency Decomposition for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you allocate and measure the Coral Edge TPU drone pipeline's 50 ms latency budget across preprocessing, transfers, inference, and control?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1887", "title": "Optimizing Real-time Object Detection Latency on Hailo-8 for Edge AI", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose and optimize the Hailo-8 drone detection pipeline to reduce end-to-end latency from 150ms to the 100ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1888", "title": "Optimizing Object Detection on Jetson Orin: Dynamic Batching for Throughput and Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which batching strategy and scheduling policy would you use for 4K30 object detection to maximize throughput while keeping latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1889", "title": "Optimizing Object Detection Latency and Throughput on Hailo-8 with Advanced Batching Strategies", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use static, dynamic, and continuous batching to balance 50 ms critical-event latency with maximum multi-stream throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1890", "title": "Diagnosing Latency Spikes with Batching on Qualcomm Cloud AI 100", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the static batch size of 4 causing tail latency spikes, and how would dynamic or continuous batching reduce them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1891", 
"title": "Optimizing Real-time Inference on NVIDIA Jetson Orin with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What batching and scheduling policy would you use to keep 90% of inferences under 50 ms while maximizing throughput?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1892", "title": "Jetson Orin: Latency vs. Throughput in Edge Inference Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you batch and schedule multi-camera YOLOv7 to maximize effective FPS while keeping every frame under 100ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1893", "title": "Real-Time Object Detection on Hailo-8: Analyzing Jank and ANR", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "As a senior ML Systems Engineer, how would you systematically analyze the root causes of these real-time performance violations given the stated constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1894", "title": "Diagnosing Latency Spikes on Qualcomm Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which profiling tools and traces would you use on the Qualcomm Cloud AI 100 to determine whether the latency spikes are compute, memory, or I/O bound?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1895", "title": "Optimizing Real-time Inference Latency on Jetson Orin", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the Jetson Orin pipeline to find whether the 150 ms inference latency is compute-, memory-, I/O-, or power-throttling-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1896", "title": "Optimizing Latency on Hailo-8: Profiling for Edge ML Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the accelerator and host 
pipeline to pinpoint the compute, memory, or I/O bottlenecks causing 50 ms latency violations?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1897", "title": "Optimizing Real-time Object Detection Latency on Google Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the Coral Edge TPU pipeline and quantify why 30 ms TPU inference becomes over 80 ms end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1898", "title": "Diagnosing Latency on Qualcomm Cloud AI 100 with Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use profiling and trace tools on the Qualcomm Cloud AI 100 to determine whether the 50 ms LLM latency miss is compute, memory, or I/O bound?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1899", "title": "Hailo-8 Edge Latency Bottleneck for Object Detection", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you identify whether compute, memory, or I/O is causing dropped frames on the Hailo-8 at 30 FPS, and what first mitigations would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1900", "title": "Optimizing Latency on Edge Accelerators for LLM Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically find and fix the bottlenecks causing latency spikes beyond the 50 ms budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1901", "title": "Real-time Object Detection Latency Optimization on NVIDIA Jetson Orin", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify the bottlenecks causing 1080p/30 FPS latency spikes above 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1902", "title": "Jetson Orin INT8 Quantization: Performance vs. 
Accuracy Trade-offs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you recover mAP and reduce false negatives on small objects after INT8 PTQ of YOLOv7?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1903", "title": "Quantizing a Large Language Model for Qualcomm Cloud AI 100 Deployment", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you quantize and deploy the FP16 LLM on the Qualcomm Cloud AI 100 to maximize INT8/INT4 performance while preserving accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1904", "title": "Designing a Mixed-Precision Strategy for Edge Deployment on Hailo-8", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision conversion and deployment strategy would you use on Hailo-8 to maximize throughput and power efficiency within 1% of FP32 accuracy?", "chain_ids": ["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1905", "title": "Quantization Strategy for Coral Edge TPU Deployment", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantize and validate FP32 ResNet-50 for the INT8-only Coral Edge TPU to hit 10 ms latency with under a 2-point accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1906", "title": "Optimizing Mixed-Precision Inference on Hailo-8", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you adapt the FP32 LLM for mixed-precision inference on an INT8-optimized edge accelerator while evaluating accuracy, latency, and power tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1907", "title": "Optimizing Edge Deployment: Mixed-Precision Training for Coral Edge TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare Model A and Model B for INT8-only Coral deployment across accuracy, throughput, energy, and quantization risks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "edge-1908", "title": "Optimizing Large Language Model Inference with Mixed-Precision on Qualcomm Cloud AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision deployment strategy fits the FP32 LLM within 32 GB memory while minimizing latency and preserving accuracy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1909", "title": "Optimizing Real-time Object Detection on NVIDIA Jetson Orin via Mixed-Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks and design a quantified mixed-precision plan for YOLOv8-L on Jetson Orin to reach 30 FPS without OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1910", "title": "Extreme Quantization Strategy for Large Models on Google Coral Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use extreme sub-4-bit quantization for the oversized model on the INT8-only Coral Edge TPU while meeting <50 ms latency and <5% accuracy loss?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1911", "title": "Deploying Sub-4-bit LLMs on Qualcomm Cloud AI 100: Balancing Precision and Performance", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement and evaluate sub-4-bit quantization for the LLM while maintaining accuracy and exploiting its INT8 hardware?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1912", "title": "Sub-4-bit Vision Model Deployment on Hailo-8", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you quantize the object detector for Hailo-8 to achieve at least 85% mAP and 30 FPS under its 26 TOPS, 2.5W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1913", "title": "Extreme Quantization Deployment on Qualcomm Cloud AI 100", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you prepare, deploy, and validate the 2-bit AWQ LLM on the Cloud AI 100 given limited native sub-4-bit support?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 2}, 
"chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1914", "title": "Extreme Quantization of Vision Transformer on Hailo-8", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you quantize a vision transformer to 2-bit weights and map it efficiently onto Hailo-8's INT8 engine without excessive accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1915", "title": "Hailo-8 Edge Power Budgeting for Perception", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize Hailo-8 power so a 10 TOPS perception model sustains 30 FPS within the 5W module budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1916", "title": "Power Budgeting for Real-time Edge TPU Deployment", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you ensure a 200 GOPS INT8 model sustains 10 FPS on the 2W Coral Edge TPU, and what power trade-offs would you analyze?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1917", "title": "NVIDIA Jetson Orin Thermal Throttling: Sustained Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause for this performance degradation, and what immediate thermal management strategy should be applied?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1918", "title": "Hailo-8 Thermal Throttling Analysis", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the device dropping to 15 TOPS at 35°C, and how would you prevent it from throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1919", "title": "Quantifying Sustained Performance Degradation due to Thermal Throttling on Qualcomm Cloud AI 100", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": 
"With power throttled from 75W to 60W, what sustained INT8 TOPS, percentage reduction, and general TOPS_effective formula do you get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1920", "title": "Hailo-8 Thermal Design for Edge Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you ensure the Hailo-8 doesn't thermal throttle under the worst-case ambient conditions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1921", "title": "Coral Edge TPU Thermal Throttling in High-Ambient Environments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign cooling and workload control so the Coral Edge TPU sustains performance at 45°C without latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1922", "title": "Thermal Performance Comparison for Edge AI Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the Cloud AI 100 and Accelerator X for throttling-free sustained inference from 0°C to 45°C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1923", "title": "Optimizing Sustained Performance on NVIDIA Jetson Orin in Challenging Environments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose, mitigate, and manage thermal throttling so the drone vision system sustains performance at 40°C?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 4}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1924", "title": "Hailo-8 Edge Deployment: Sustained Performance under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify a thermal solution for Hailo-8 throughput drops below 10 TOPS in 50°C industrial operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1925", "title": "Google Coral Edge TPU Thermal Throttling in Edge Deployments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of the Coral Edge TPU throughput dropping 50% after 5-10 minutes at 40°C, and how would you 
mitigate it?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1926", "title": "Energy-Aware Model Design for Real-Time Inference on Google Coral Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Coral Edge TPU model for 30 FPS with minimal energy per inference, considering memory access is far costlier than INT8 MACs?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1927", "title": "Qualcomm Cloud AI 100: INT8 Matrix Multiply Energy Analysis", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Estimate the total energy consumed by compute operations versus memory accesses. Which component dominates the energy budget, and by how much?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1928", "title": "Energy-Aware Platform Selection for Edge AI: Qualcomm Cloud AI 100 vs. Edge AI Accelerator X", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which platform is more energy-suitable for 1000 INT8 inferences/s with a 10 GB model, and how do compute, memory, and TDP trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1929", "title": "Energy-Efficient LLM Deployment on NVIDIA Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile and optimize Jetson Orin LLM inference energy when both INT8 matmuls and LPDDR5 embedding/KV-cache accesses affect latency?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": {"edge-chain-auto-secondary-005-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1930", "title": "Transformer LLM Deployment on NVIDIA Jetson Orin: Cost-Performance Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy the 7B FP16/INT8 LLM on Jetson Orin to meet a 2048-token context and <100 ms per token latency?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1931", "title": "Optimizing Transformer Attention and KV-Cache on Edge AI Accelerators (Hailo-8)", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", 
"level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks cause 512-token latency and power spikes on the edge accelerator, and what quantified attention redesign would fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1932", "title": "MobileNet Architectural Benefits on Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are depthwise separable convolutions beneficial for CNNs on the INT8-only Coral Edge TPU with 4 TOPS and a 2W power budget?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1933", "title": "Optimizing MobileNetV3 for Edge Deployment on Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize MobileNetV3-Large on the Cloud AI 100 to reduce latency spikes and stay within the 75W power envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1934", "title": "Optimizing MobileNetV3 on Hailo-8 for Real-time Edge Inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 25ms latency on Hailo-8 and optimize MobileNetV3-Large to reach 10ms while preserving accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1935", "title": "Designing an Efficient MobileNet for Google Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and optimize MobileNetV2 for 50 FPS under 20ms and 2W on the INT8-only TPU?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1936", "title": "Edge AI Model Selection", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which model, MobileNetV2 or EfficientNetB0, would you deploy on the Jetson Orin, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1937", "title": "Edge AI Deployment: Optimizing CNN for Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": 
"architecture", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign ResNet-50 using MobileNet/EfficientNet-style blocks to hit 10ms, 90% mAP, and the 75W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1938", "title": "Edge Deployment: Model Sizing for NVIDIA Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What steps would you take to estimate the model's memory footprint and computational load, and propose strategies to ensure successful deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1939", "title": "Hailo-8 Deployment: Model Memory Footprint & Throughput", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much INT8 parameter memory does the 15M-parameter model need, what is its theoretical Hailo-8 throughput at 20 GOPS, and what do these imply?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1940", "title": "Coral Edge TPU Feasibility for MobileNetV3-Small: Memory and Performance Estimation", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on a feasibility estimation of memory footprint and INT8 operations, can the Coral Edge TPU meet the 30 FPS requirement?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1941", "title": "Hardware-aware NAS Constraints on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the Cloud AI 100's LPDDR4x, 400 TOPS INT8, and 75W budget shape the NAS search space and objectives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1942", "title": "Hardware-aware NAS Deployment on NVIDIA Jetson Orin for Real-time Object Detection", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why might the general-purpose NAS model miss Jetson Orin latency and power targets, and how should hardware-aware NAS fix this?", "chain_ids": ["edge-chain-auto-secondary-007-07"], 
"chain_positions": {"edge-chain-auto-secondary-007-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1943", "title": "Hardware-aware NAS for Hailo-8: Latency and Memory Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you identify Hailo-8 layer bottlenecks and estimate whether a modified layer meets 5ms latency or 50KB feature-map memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1944", "title": "Hardware-aware NAS for Coral Edge TPU Deployment", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS strategy for Coral Edge TPU that enforces <30ms latency, <2W power, and INT8 constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1945", "title": "Hardware-Aware NAS for a Vision Model on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS pipeline for the AI 100 to reduce the 150 GOPS, 12GB-activation baseline while keeping at least 88% mAP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1946", "title": "Hardware-aware NAS for Real-time Object Detection on NVIDIA Jetson Orin", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design and implement a hardware-aware NAS strategy that explicitly constrains the search space by the Jetson Orin's SRAM/LPDDR5 capacity, actual INT8 FLOPs, and measured inference latency?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1947", "title": "Hardware-Aware NAS for Edge TPU Deployment", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use hardware-aware NAS to design a 30 FPS object detector for the INT8-only Coral Edge TPU under its limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1948", "title": "Encoder-Decoder Selection for NLU on Google Coral Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture (encoder-only, decoder-only, or encoder-decoder) fits Coral Edge TPU NLU best under 4 TOPS INT8 and 2W, and why?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1949", "title": "Edge Deployment Tradeoffs: Encoder-Decoder Architectures on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which NLP architecture would you choose on the edge for both classification and summarization, and how would you meet memory, latency, and power limits?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1950", "title": "Edge TPU Deployment: Encoder-Decoder Model Latency Diagnosis", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural and deployment issues likely cause >500ms latency and thermal warnings for the INT8 encoder-decoder NLU model on Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1951", "title": "Deploying Large Language Models on Qualcomm Cloud AI 100: Encoder-Decoder Tradeoffs", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which LLM architecture would you choose on this platform for 100-token NLU under 100ms, and how do memory, INT8 compute, and power constraints affect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1952", "title": "Edge NLP Architecture Tradeoffs for Summarization on NVIDIA Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare encoder-only, decoder-only, and encoder-decoder summarizers on Jetson Orin for memory, latency, and power, and choose one?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1953", "title": "Hailo-8 Edge Deployment: Encoder-Decoder Architecture Tradeoffs for NLP Classification", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For Hailo-8 text 
classification, how would you compare an INT8 MobileBERT-style encoder with a distilled GPT-2-style decoder, and which is preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1954", "title": "Encoder-Decoder Tradeoffs on Google Coral Edge TPU for Language Tasks", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How feasible are encoder-only, decoder-only, and encoder-decoder NLU models on the 4 TOPS, 2W, INT8-only Coral Edge TPU, and how would you make one deployable?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1955", "title": "Pruning Techniques and Hardware Alignment on NVIDIA Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do structured and unstructured pruning differ for an LLM on an edge INT8 accelerator, and what accuracy-speedup tradeoffs do they create?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 0}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1956", "title": "Hailo-8 Pruning Strategy for Efficient Edge Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do pruning sparsity patterns matter on a dataflow architecture, and how would you trade accuracy, speedup, and power for structured vs unstructured pruning?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 0}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1957", "title": "Optimizing Edge Vision Models: Structured Pruning for Coral TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage pruning and sparsity techniques, specifically considering the Coral Edge TPU's architecture and INT8-only support, to achieve the <50ms target without significant accuracy loss?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 3}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1958", "title": "Optimizing Sparse Transformer Inference on Qualcomm Cloud AI 100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "How would you convert 90% unstructured sparsity into hardware-friendly sparsity on the AI 100 to maximize throughput while preserving 99% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1959", "title": "Optimizing LLM Deployment on Hailo-8 via Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose pruning granularity and sparsity patterns to maximize accelerator utilization and minimize energy consumption?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 1}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1960", "title": "Pruning MobileNetV2 for Google Coral Edge TPU Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you prune MobileNetV2 for the INT8-only Coral Edge TPU, and which sparsity type best improves speed and power with minimal accuracy loss?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 1}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1961", "title": "Optimizing Vision Model Latency with Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 70% unstructured pruning fail on Jetson Orin, and what structured pruning pattern would you use to reach 30 FPS?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 2}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1962", "title": "Optimizing LLM Deployment on Hailo-8 via Pruning and Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose structured or unstructured sparsity patterns for an INT8 LLM on Hailo-8, and evaluate accuracy, latency, throughput, and power?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 2}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1963", "title": "Optimizing Large Language Models on Qualcomm AI 100: Knowledge Distillation Strategies", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use logit and feature distillation to build an INT8 student 
for Cloud AI 100, and when is distillation better than pruning?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1964", "title": "Optimizing Edge Inference with Knowledge Distillation on Hailo-8", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill a 90% mAP YOLOv5s teacher into a Hailo-8 student that achieves at least 88% mAP, <10ms latency, and lower power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1965", "title": "Knowledge Distillation for Low-Latency Object Detection on Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the maximum INT8 MAC budget for 50ms on the 4 TOPS Coral Edge TPU, and how would distillation preserve accuracy within it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1966", "title": "Optimizing Large Language Models for Edge Deployment on NVIDIA Jetson Orin using Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill the 10B LLM into an INT8 student for the edge device that fits 32GB, stays under 60W, and reaches sub-20ms queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1967", "title": "Knowledge Distillation on Google Coral Edge TPU: Teacher-Student Model Evaluation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you evaluate the distilled MobileNetV2 on Coral Edge TPU against the ResNet-50 teacher, including accuracy, latency, power, memory, and INT8 effects?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1968", "title": "Knowledge Distillation for Efficient LLM Deployment on Qualcomm Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use knowledge distillation to deploy an accurate INT8 student LLM on the Cloud AI 100 within 32GB LPDDR4x, 400 TOPS, and 75W limits?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 3}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-1969", "title": "Optimizing Vision Models for Hailo-8 with Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use logit and feature distillation to optimize a ResNet-50 detector for Hailo-8, and when is KD better than pruning?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1970", "title": "Optimizing Memory-Bound Operations on Hailo-8 via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would kernel and operator fusion reduce latency and energy for sequential memory-bound INT8 ops on a dataflow accelerator?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 1}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1971", "title": "Optimizing a Vision Model with Operator Fusion on Google Coral Edge TPU", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse ReLU, BatchNorm, and Quantize after the 1x64x64x128 INT8 Conv2D output to reduce Coral TPU memory I/O and launch overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1972", "title": "Optimizing a Vision Model on Qualcomm Cloud AI 100 with Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the memory-bound ops before the 2D convolution on Cloud AI 100 to reduce LPDDR4x traffic, kernel launches, latency, and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1973", "title": "Optimizing Edge Inference on Jetson Orin: Kernel Fusion for Memory-Bound Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Would you keep the Jetson Orin post-processing ops as separate CUDA kernels or fuse them into one custom kernel, and what latency benefit would you expect?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 2}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1974", "title": "Hailo-8: Optimizing Memory-Bound Operations via Kernel Fusion for Edge Deployment", "topic": "kernel-fusion", "competency_area": 
"optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign this block to maximize throughput and energy efficiency, leveraging the Hailo-8's architectural characteristics?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 3}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1975", "title": "AOT Optimization Bottlenecks on Qualcomm AI 100 for LLM Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might the INT8 AOT-compiled LLM underperform on Cloud AI 100, and how would you optimize the graph for its memory hierarchy and ISA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1976", "title": "Optimizing a Large Language Model for Edge Deployment on NVIDIA Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the PyTorch LLM for efficient real-time execution on Jetson Orin using AOT compilation and graph-level compiler passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1977", "title": "Hailo-8 Inference Optimization with Constant Folding", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the maximum throughput increase for the folded Hailo-8 subgraph, and how much power would the 25% operation reduction save?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1978", "title": "Diagnosing Coral Edge TPU Graph Compilation & Performance Issues", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the INT8 TFLite model failing or falling back on Coral Edge TPU, and how would you fix the graph compilation pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1979", "title": "Optimizing Large Language Model Deployment on Qualcomm Cloud AI 100 via Ahead-of-Time Graph Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT compilation strategy on the Cloud AI 100 to meet sub-10ms latency and 500 batch-1 inferences per second?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1980", "title": "Jetson Orin Deployment: Compiler Optimization for Real-time Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you investigate and improve the graph compilation pipeline to reduce 200ms/token latency and mitigate throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1981", "title": "Optimizing a Large Language Model for Qualcomm Cloud AI 100 via Graph Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compile and optimize the 7B quantized LLM across operator lowering, constant folding, memory, and power?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1982", "title": "Optimizing Operator Scheduling on NVIDIA Jetson Orin", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What operator scheduling strategies should the developer use on Jetson Orin for memory reuse, parallel execution, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1983", "title": "Optimizing CNN Inference on Hailo-8 for Edge Deployment", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule CNN operators to hit 30 FPS while optimizing memory reuse, parallelism, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1984", "title": "Optimizing Conv Layer Memory on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What peak intermediate memory is required under sequential execution versus ideal layer fusion for the two convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1985", "title": "Optimizing Deep Learning Inference on Jetson Orin: Diagnosing Performance Bottlenecks in Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the Jetson Orin pipeline reaches only 15 FPS, focusing on execution order, memory reuse, and CPU-GPU transfers?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1986", "title": "Optimizing MobileNetV3 for Hailo-8: Scheduling for Memory, Parallelism, and Fusion", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule MobileNetV3-Large operators on Hailo-8 to meet <5ms latency using memory reuse, parallel branches, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1987", "title": "Optimizing Multi-branch Vision Model Inference on Google Coral Edge TPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator execution order on the Coral Edge TPU for the multi-branch INT8 detector to reduce latency, memory, and energy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1988", "title": "Edge AI Accelerator Comparison: Operator Scheduling for BERT Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate the impact of memory reuse, parallel execution, and layer fusion on BERT-tiny latency and throughput across the two accelerators?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1989", "title": "Optimizing Vision Transformer Execution on NVIDIA Jetson Orin", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize ViT operator scheduling on the edge device to meet a 30ms latency target while reducing memory use and staying within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1990", "title": "Hailo-8 CNN Inference Optimization via Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize Hailo-8 operator scheduling to reach 30 FPS using memory reuse, parallel execution, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1991", "title": "Load Balancing for Dynamic Qualcomm Cloud AI 100 Inference Workloads", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which load balancing algorithm best minimizes re-partitioning and cache invalidation during scaling events, and how does it work?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 0}, 
"chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1992", "title": "Edge Inference Traffic Management on NVIDIA Jetson Orin", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing for a Jetson Orin edge fleet to handle variable model loads, network latency, and the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1993", "title": "Edge Inference Load Balancing with Hailo-8 Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a request routing and load balancing strategy using consistent hashing and weighted round-robin to handle mixed critical and batch inference requests across Hailo-8 accelerators under failures or thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1994", "title": "Coral Edge TPU Fleet Sizing and Load Balancing for Real-time Inference", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many inferences per second can one Coral Edge TPU handle, how many TPUs are needed at 70% utilization for 800 IPS, and what total power is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1995", "title": "Edge Inference Load Balancing for Qualcomm Cloud AI 100 Fleet", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing and load balancing for a Qualcomm Cloud AI 100 edge fleet serving diverse real-time LLM and CV workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1996", "title": "Edge ML Inference: Optimizing Load Balancing with Hailo-8 Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which two routing architectures would you compare for the 50-Hailo-8 cluster, and how would you handle spikes to 1000 requests per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1997", "title": "Edge Inference Load Balancing with Coral Edge TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route and load balance inference frames across thousands of Edge TPUs under variable networks, failures, and INT8-only constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1998", "title": "Optimizing Edge Inference Routing on Qualcomm Cloud AI 100", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you replace round-robin routing for the Cloud AI 100 fleet to reduce latency spikes and uneven utilization in real-time object detection?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1999", "title": "Edge AI Load Balancing & Routing for NVIDIA Jetson Orin Deployments", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design load balancing, request routing, model versioning, dynamic scaling, and fault tolerance for the Jetson Orin drone fleet?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2000", "title": "Google Coral TPU Edge Deployment Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What should the Coral Edge TPU server do for INT8 model loading, lightweight serving, and concurrent requests to achieve low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2001", "title": "Optimizing Cold Start on Qualcomm Cloud AI 100 for Edge Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you minimize cold start latency for a 20GB LLM on Qualcomm Cloud AI 100 when scaling to zero, while staying within memory and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2002", "title": "Jetson Orin Edge Model Memory Management for Autonomous Vehicles", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many unique secondary models can be 
pre-loaded while reserving RAM and TOPS for the primary and one active secondary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2003", "title": "Edge Object Detection with Hailo-8: Latency, Throughput, and Power Optimization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the serving infrastructure and latency budget across data acquisition, pre/post-processing, and inference to meet 70ms end-to-end latency on a Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2004", "title": "Optimizing Real-time Edge Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a model serving strategy to meet the <50ms latency and 100 detections/sec throughput requirements while handling cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2005", "title": "Edge AI Model Serving on Hailo-8 with Dynamic Workloads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a Hailo-8 model serving strategy for multiple variable-load inspection models with sub-50ms critical latency and low cold starts?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2006", "title": "MLOps for INT8 Deployment on Coral Edge TPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What Coral Edge TPU data type constraint must the model artifact satisfy, and what CI/CD practice ensures it before deployment?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 0}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2007", "title": "Edge AI Model Deployment: Latency vs. 
Throughput on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the 15 TOPS INT8 model meet a 50ms latency target on Jetson Orin, and what CI/CD profiling and MLOps steps would you add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2008", "title": "MLOps for Edge Deployment with Google Coral Edge TPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the MLOps pipeline for FP32-to-INT8 Coral Edge TPU deployments to ensure CI/CD, reproducibility, and training-serving consistency?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 1}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2009", "title": "MLOps for LLM Deployment on Qualcomm Cloud AI 100 Edge Devices", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the MLOps lifecycle for frequent LLM updates on Qualcomm Cloud AI 100 while meeting 32GB memory, INT8, and 75W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2010", "title": "Edge MLOps Performance Regression on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the YOLOv8s FPS regression on Jetson Orin and improve MLOps so future CI/CD catches edge performance regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2011", "title": "Edge LLM OTA Update System Design for Qualcomm AI 100", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design OTA updates for 100,000 edge devices with 10GB firmware, <5min downtime, and immediate rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2012", "title": "Optimizing FOTA for NVIDIA Jetson Orin Fleets with A/B Partitions", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a reliable A/B FOTA strategy for remote devices under intermittent networks and power interruptions?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 5}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2013", "title": "Optimizing Object Detection on Hailo-8 with Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify and resolve Hailo-8 operator gaps, delegate unsupported ops, and weigh the delegation trade-offs?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2014", "title": "Optimizing ONNX Model Deployment on Qualcomm Cloud AI 100 with Operator Coverage Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify, diagnose, and resolve ONNX operator gaps and delegate work to maximize throughput on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2015", "title": "Diagnosing Suboptimal ONNX Runtime Performance on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the system only reaches 8 FPS with low GPU and high CPU usage, and what fixes would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2016", "title": "Optimizing LLM Deployment on Hailo-8 with Custom Operator Challenges", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle unsupported PyTorch operators and fit the LLM within the Hailo-8 latency, compute, and 2.5W power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2017", "title": "Optimizing Vision Model Deployment on Jetson Orin: TensorRT Conversion Challenges", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you find ONNX-to-TensorRT operator gaps and choose between plugins, layer substitution, or CPU fallback to hit sub-20 ms on Jetson Orin?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2018", "title": "Progressive Rollout Strategy for 
Edge ML on Qualcomm Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you roll out the new model progressively with traffic splitting, monitoring, and rollback, given edge fleet constraints?", "chain_ids": ["edge-chain-auto-secondary-011-03"], "chain_positions": {"edge-chain-auto-secondary-011-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2019", "title": "Analyzing Performance Degradation in Edge ML Model Rollouts", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the Jetson Orin canary drop from 60 to 30 FPS and P99 jump from 50 to 150 ms, and which metrics would you correlate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2020", "title": "Edge TPU Model Rollout Strategy with A/B Testing", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a 48-hour 1% canary rollout of v2 across 10,000 intermittent Coral Edge TPU cameras with A/B comparison and rollback?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2021", "title": "Diagnosing Canary Rollout Failures on Edge AI with Qualcomm Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the canary group's P99 latency jump from 50 ms to 200 ms and error-rate increase on Qualcomm Cloud AI 100 devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2022", "title": "Edge ML Model Rollout on NVIDIA Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you ensure observability, seamless transitions between models, and robust rollback mechanisms for both model artifacts and configurations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2023", "title": "Hailo-8 Edge Deployment: Canary Release for Object Detection Model", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you run a 10% canary of the new Hailo-8 object detector across 1,000 devices, monitor it, and estimate 
power impact?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2024", "title": "Evaluating A/B Rollout Strategies for Edge AI Models on Google Coral TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare canary and shadow rollout strategies for Model B on 10,000 Coral Edge TPU cameras, including KPIs, constraints, and rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2025", "title": "Canary Rollout and Optimization for Edge ML", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a Jetson Orin canary rollout and monitor latency, throughput, power, and rollback triggers for regressions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2026", "title": "Diagnosing End-to-End Latency in a Multi-Model Edge Pipeline on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 250 ms latency in the Hailo-8 two-model video pipeline when CPU utilization is low?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2027", "title": "Real-time Multi-Model Object Analysis on Google Coral Edge TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the Coral Edge TPU compound detector-classifier pipeline to stay under 100 ms per frame and 2W?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2028", "title": "Real-time Anomaly Detection on Jetson Orin: Multi-model RAG Pipeline Optimization", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture, monolithic or microservices, is better for sub-100 ms on Jetson Orin, and what are the latency, memory, and power trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2029", "title": "Optimizing a Multi-Modal RAG Pipeline for Edge Deployment on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and deploy the Hailo-8 multimodal RAG pipeline to meet <200 ms latency, memory, routing, and 2.5W power 
constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2030", "title": "Optimizing Chained Inference on Google Coral Edge TPU for Real-time Anomaly Detection", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks beyond raw TPU inference explain the 60 ms frame latency, and how would you optimize them to meet the 33 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2031", "title": "Real-time Data Drift Monitoring on Qualcomm Cloud AI 100", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor input data drift on Qualcomm Cloud AI 100 without significantly impacting real-time inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2032", "title": "Diagnosing Performance Degradation on Jetson Orin: Edge Drift Analysis", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you determine whether the accuracy drop is due to data or concept drift and detect it under edge constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2033", "title": "Real-time Data Drift Detection on Google Coral Edge TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect lighting or camera-sensor data drift on Coral Edge TPUs using integer-friendly statistics without uploading raw data?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2034", "title": "Edge AI Drift Detection on Hailo-8", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect slow data drift on a production-line detector while minimizing inference overhead and staying within the 2.5W power impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2035", "title": "Detecting and Mitigating Data Drift on Jetson Orin for Edge Reliability", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify data drift on Jetson Orin, mitigate it within resource limits, and measure restored reliability?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2036", 
"title": "Edge ML Drift Detection on Hailo-8 for Real-time Anomaly Systems", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an on-device Hailo-8 drift monitoring and reliability strategy that accounts for 26 TOPS INT8, 2.5W, and quantization effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2037", "title": "Graceful Degradation on Jetson Orin for Autonomous Vehicles", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for a Jetson Orin perception system that misses its 50 ms deadline under load or sensor degradation?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2038", "title": "Graceful Degradation for Real-time Object Detection on Edge AI (Hailo-8)", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a graceful degradation strategy using ladders, fallbacks, and QoS shedding, distinguishing between fail-operational and fail-safe states?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2039", "title": "Graceful Degradation for Real-time Drone Navigation on Google Coral Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Coral Edge TPU graceful degradation strategy for drone object detection using fallback models, QoS shedding, and fail-safe modes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2040", "title": "Diagnosing Graceful Degradation on Jetson Orin for Edge ML", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the performance degradation and the failure of graceful fallback mechanisms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2041", "title": "Graceful Degradation for Edge Defect Detection on Hailo-8", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would 
you design degradation ladders, model fallbacks, fail-safe behavior, and QoS shedding for the Hailo-8 defect detector under a 3W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2042", "title": "Graceful Degradation for Edge Object Detection on Google Coral TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for a Coral Edge TPU industrial detector under stress while respecting INT8-only 2W limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2043", "title": "Designing a Graceful Degradation Strategy for Real-time ML on NVIDIA Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design fail-operational graceful degradation on Jetson Orin using model fallbacks, degradation ladders, and QoS shedding under load or sensor faults?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2044", "title": "Graceful Degradation on Hailo-8 for Edge Surveillance", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Hailo-8 surveillance degradation strategy with ladders, model fallbacks, QoS shedding, and fail-safe versus fail-operational trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2045", "title": "Ensuring ISO 26262 Compliance on Google Coral Edge TPU for Safety-Critical ML", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect edge safety software for ASIL-B deterministic inference, watchdogs, and self-tests under strict INT8 and 2W power constraints?", "chain_ids": ["edge-chain-auto-secondary-008-01"], "chain_positions": {"edge-chain-auto-secondary-008-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2047", "title": "Designing an ISO 26262 ASIL-B Perception System on Hailo-8", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect an ASIL-B Hailo-8 perception pipeline with deterministic latency, watchdogs, self-tests, and fault responses within 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-2048", "title": "Hailo-8 Hardware Security Features for Edge Adversarial Robustness", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What built-in hardware security features should a Hailo-8-class edge AI accelerator use to resist physical attacks and model extraction?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2049", "title": "Adversarial Robustness of INT8 Models on Coral Edge TPU for Security Applications", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does INT8-only inference on the 4 TOPS, 2W Coral Edge TPU affect adversarial patch susceptibility and feasible defense strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2050", "title": "Adversarial Robustness Design for Edge AI on Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you secure and harden an INT8 image classifier against adversarial inputs, extraction, and side channels within edge constraints?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2051", "title": "Quantifying Adversarial Defense Overhead on Edge AI for Real-time Systems", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum extra latency an adversarial defense can add while maintaining a minimum real-time processing rate of 25 FPS, and what edge trade-offs follow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2052", "title": "Diagnosing Intermittent Object Detection Failures on Hailo-8 Due to Adversarial Input", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose whether subtle Hailo-8 INT8 object detection failures are targeted adversarial attacks under 2.5W edge constraints?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2053", "title": "Edge AI Reliability: Adversarial Robustness on Google Coral TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": 
"both", "question": "How would you design the anomaly detector to maintain adversarial robustness within the strict INT8, 4 TOPS, and 2W hardware constraints?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2054", "title": "Mitigating Model Extraction on Edge AI with Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a defense strategy that minimizes performance impact while effectively mitigating the extraction risk?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2055", "title": "Mitigating Adversarial Patch Attacks on Hailo-8 Edge Deployments for Autonomous Vehicles", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Hailo-8 defense against physical adversarial patches that maintains 30 FPS within 26 TOPS INT8 and 2.5W?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2056", "title": "Coral Edge TPU Adversarial Defense Performance Bottleneck", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 12 ms defense bottleneck on the Coral Edge TPU, and how would you optimize it to meet the 50 ms latency budget?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2057", "title": "Model Extraction Attack on Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you mitigate power-side-channel model extraction on the Cloud AI 100 despite secure boot and physical access?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2058", "title": "Jetson Orin Edge Monitoring: Key Telemetry for Performance Degradation", "topic": "monitoring-observability", 
"competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For basic recall-level monitoring, what specific telemetry metric is crucial to track to quickly detect compute-related performance degradation, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2059", "title": "Hailo-8 Edge Deployment: Diagnosing Performance Degradation through Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "As a reliability engineer, specify what additional telemetry you would collect and how you would correlate it to pinpoint why only some Hailo-8 devices show rising latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2060", "title": "Edge TPU Fleet Reliability: MTBF Calculation for 99.9% Uptime", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What MTBF in hours is required to achieve 99.9% uptime per Edge TPU device with a 4-hour MTTR?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2061", "title": "Diagnosing Latency Spikes and Timeouts on Qualcomm Cloud AI 100 for Edge ML", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you approach diagnosing the root cause of this degradation and what specific metrics would you prioritize for investigation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2062", "title": "Real-time Edge TPU Monitoring Strategy", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design monitoring and observability for 1000 Coral Edge TPUs, including telemetry, alerts, MTBF/MTTR, stragglers, and dashboards?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2063", "title": "Diagnosing Edge AI Performance Bottlenecks on Qualcomm AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose intermittent p99 latency on edge devices, quantify MTBF impact, and estimate improvement from an 
optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2064", "title": "Edge AI Observability for Autonomous Perception on Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design monitoring and observability for Jetson Orin autonomous vehicles, covering telemetry, alerts, MTBF/MTTR, stragglers, and dashboards?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2065", "title": "Optimizing Real-time Edge ML Data Pipelines with Coral TPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where are the throughput bottlenecks in the Coral Edge TPU pipeline for 30 FPS, given 5 ms CPU preprocessing, 100 GOP inference, and INT8-only support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2066", "title": "Jetson Orin: Real-time Multi-Stream Video Pipeline Bottleneck Analysis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where would you investigate, and how would you apply the data pipeline equation to diagnose and optimize the system for sustained performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2067", "title": "Real-time Edge ML Data Pipeline Optimization with Hailo-8", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you optimize this video pipeline to avoid 500 ms spikes and buffer overflows under a 5 Mbps uplink budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2068", "title": "Edge TPU Data Pipeline Bottleneck Analysis for Real-time Anomaly Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use the data pipeline equation to diagnose the Coral TPU ETL bottleneck and optimize 10 KB JSON messages at 10 Hz for real time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2069", "title": "Edge Data Quality for Hailo-8 Deployments", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you architect edge-side data quality validation for Hailo-8 metadata so only fresh, valid, accurate data reaches the data lake?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-2070", "title": "Edge Data Quality and Anomaly Detection on Coral TPU", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a lightweight INT8 data quality and anomaly pipeline on Coral Edge TPUs without exceeding 2W or harming fault detection latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2071", "title": "Diagnosing Data Quality Degradation on Edge AI Accelerator", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and localize upstream data quality issues causing accuracy loss and latency on a remote Qualcomm Cloud AI 100 edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2072", "title": "Edge Data Integrity Real-time Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality system on Jetson Orin that enforces schemas, drift gates, and anomaly checks within 50ms and 60W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2073", "title": "Edge Data Validation on Hailo-8 for Real-time ML", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a robust data validation pipeline for the 30 FPS video stream while minimizing overhead on the Hailo-8's 2.5W power budget?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2074", "title": "Edge Data Integrity: Coral TPU Architecture Evaluation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Would you perform sensor validation on the Edge TPU, the host CPU, or a hybrid path, and what trade-offs drive that choice?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2075", "title": "Real-time Edge Data Quality on Qualcomm Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you implement edge data contracts, schema validation, quality gates, and anomaly detection 
within the 75W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2076", "title": "Edge Data Quality & Validation for Anomaly Detection on NVIDIA Jetson Orin", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose sensor-data quality degradation and add validation gates on Jetson Orin without breaking its 60W and latency constraints?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2077", "title": "Edge Data Quality and Validation for Critical Hailo-8 Deployments", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality pipeline for a Hailo-8 drone that enforces data contracts and detects sensor anomalies without cloud reliance?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 4}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2078", "title": "Active Learning for Edge Model Adaptation on NVIDIA Jetson Orin", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What edge-friendly model update strategy would you use when raw data cannot be streamed for relabeling over the limited satellite uplink?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2079", "title": "Edge Active Learning and Bias Mitigation on Hailo-8", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select Hailo-8 traffic samples for annotation, estimate uncertainty efficiently, and mitigate bias under 26 TOPS INT8 and 2.5W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2080", "title": "Edge AI Dataset Curation for Rare Events on Coral TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate, label, and use edge-selected data to improve rare-class detection on a Coral Edge TPU with intermittent connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-2081", "title": "Optimal Labeling Strategy for Edge Deployment on Qualcomm Cloud AI 100", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "At what labeled-sample count does active learning become cheaper than random sampling given $0.50 labels, 30% fewer labels, and $5,000 upfront cost?", "chain_ids": ["edge-chain-auto-secondary-003-25"], "chain_positions": {"edge-chain-auto-secondary-003-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2082", "title": "Coral Edge TPU Disease Detection: Active Learning for Rare Disease Data Curation", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design and quantify an active learning loop to improve rare plant disease detection on a Coral Edge TPU under INT8 and annotation-budget constraints?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2083", "title": "Optimizing Edge AI Dataset Curation for Qualcomm Cloud AI 100 Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare traditional batch annotation versus active learning for Cloud AI 100 defect detection and choose a curation strategy?", "chain_ids": ["edge-chain-auto-secondary-003-25"], "chain_positions": {"edge-chain-auto-secondary-003-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2084", "title": "Optimizing Dataset Curation for Hailo-8 Edge Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the bottleneck in the current data pipeline and quantify the potential improvement of your proposed solution, leveraging the NPU capabilities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2085", "title": "Real-Time Anomaly Detection on Edge AI Device", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect a 50ms edge anomaly detection pipeline for hundreds of 100 Hz sensors with intermittent cloud connectivity?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2086", "title": "Real-time Sensor Data Ingestion on NVIDIA Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What raw sensor bandwidth is required, can the Orin's 68 GB/s effective LPDDR5 handle it, and how would you provision resources for these tasks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2087", "title": "Edge Real-Time Anomaly Detection with Qualcomm Cloud AI 100", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What challenges and design choices matter for real-time processing of 1.2 GB/s camera and LIDAR streams on a Qualcomm Cloud AI 100 edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2088", "title": "Optimizing Real-time Edge Inference on Coral TPU for Defect Detection", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks are limiting the Edge TPU pipeline to 15 FPS, and how would you optimize and measure progress toward 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2089", "title": "Hailo-8 Edge Data Pipeline: Optimizing Storage for Efficient Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L1", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format and compression traits would you choose for 500 MB/s patch ingestion on the Hailo-8, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2090", "title": "Edge TPU Storage: Optimizing Data Formats for Real-time Inference Buffering", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you store and compress 12 hours of inference results at 5 FPS for energy-efficient, reliable local buffering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2091", "title": "Optimizing Data Storage for On-Device Inference on Qualcomm Cloud AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage format strategy would you use for sensor data, intermediate results, and checkpoints on Cloud AI 100 to balance compression, speed, and LPDDR4x use?", "chain_ids": 
["edge-chain-auto-secondary-008-34"], "chain_positions": {"edge-chain-auto-secondary-008-34": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2092", "title": "Optimizing Sensor Data Storage on NVIDIA Jetson Orin for Edge ML", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format and compression would you choose on Jetson Orin, and how many days of 10-feature float32 sensor records fit in 64 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2093", "title": "Edge AI Feature Store Optimization for Real-time Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format and compression strategy would you choose for 50MB frames at 100 FPS on the edge accelerator, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2094", "title": "Optimizing Edge ML Data Storage on NVIDIA Jetson Orin for Autonomous Drones", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What two local storage architectures would you compare for 24 hours of 1080p 30fps video and detections on a 256GB Jetson Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2095", "title": "Edge AI Data Storage Optimization on Qualcomm Cloud AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the local storage pipeline for 72 hours of sensor data on Cloud AI 100 while minimizing I/O and memory overhead?", "chain_ids": ["edge-chain-auto-secondary-008-34"], "chain_positions": {"edge-chain-auto-secondary-008-34": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2096", "title": "Data Pruning for Edge Deployment on Qualcomm Cloud AI 100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is data pruning, how does it differ from coreset selection, and when would it help deploy on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2097", "title": "Edge Anomaly Detection and the Data Wall on Jetson Orin", "topic": "data-efficiency-selection", 
"competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do the data wall and poor ICR arise, and what data selection strategy would you use to fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2098", "title": "Edge ML with Hailo-8: Data Efficiency for Continuous Adaptation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select and process data for continuous learning on Hailo-8 without exceeding the power budget or risking model collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2099", "title": "Data Pruning for Edge TPU Deployment", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantify the potential benefits of such a strategy, specifically targeting the limitations of the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2100", "title": "Diagnosing Data Wall Challenges on Qualcomm Cloud AI 100 for Edge ML", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data-efficient strategy resolves this 'data wall' while remaining within the strict 32 GB and 75W edge hardware constraints?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2101", "title": "Data-Efficient Continuous Learning for Anomaly Detection on NVIDIA Jetson Orin", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the Jetson Orin data pipeline to avoid the data wall and model collapse during daily on-device fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2102", "title": "Optimizing Data for Edge AI on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach identifying and mitigating this issue, specifically leveraging data efficiency and selection techniques given the Hailo-8's constraints?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2103", "title": "Optimizing Data Efficiency for Edge Deployment on NVIDIA Jetson Orin", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the data bottleneck, use coreset selection or pruning, and quantify gains toward 30 FPS and 90% mAP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2104", "title": "Edge Data Efficiency for Real-time Object Detection on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use data selection to reduce 1080p 30 FPS drone input while preserving on-device adaptation capabilities?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2105", "title": "Federated Learning on Hailo-8 Edge Devices: Power-Efficient Convergence with Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do model architecture, local epochs, and aggregation frequency trade off convergence, communication, and energy on Hailo-8 federated learning?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 0}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2106", "title": "Designing Cross-Device Federated Learning on Google Coral Edge TPUs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design federated averaging on INT8-only Coral Edge TPUs to handle communication limits and non-IID client data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2107", "title": "Optimizing Federated Learning Communication on Edge AI Accelerators", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and reduce the federated learning communication bottleneck for 10M float32 weights from the client devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2108", "title": "Fairness Evaluation on Edge AI: Qualcomm Cloud AI 100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the root 
causes of demographic approval disparities on the INT8 credit scoring deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2109", "title": "On-Device Fairness Evaluation with Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design this monitoring mechanism, and what formula tracks the FNR difference between groups?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2110", "title": "Diagnosing Bias in Edge TPU Models for Fairness", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose demographic bias on the INT8-only Coral Edge TPU across data, on-device evaluation, and quantization effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2111", "title": "Fairness-Aware Model Deployment on Edge AI Accelerator", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design on-device inference and lightweight fairness monitoring for demographic parity and equalized odds on Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2112", "title": "Fairness Evaluation on Edge Devices with NVIDIA Jetson Orin", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate and monitor demographic parity for pedestrian detection on the Jetson Orin within compute and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2113", "title": "Fairness Evaluation on Hailo-8 Edge Deployment", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a comprehensive fairness evaluation and mitigation strategy tailored to the computational constraints of the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2114", "title": "Responsible AI Deployment on Edge: Jetson Orin's Predictive Maintenance Bias Analysis", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use model cards, impact assessments, and red-teaming on Jetson Orin to explain facility-specific false positives and misses?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2115", "title": "Diagnosing Bias in Edge AI Anomaly Detection on Qualcomm Cloud AI 100", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the false positives at the new factory using data drift analysis, model cards, and impact assessments?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2116", "title": "Responsible AI on Edge: Real-time Governance for Autonomous Drones", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the responsible AI pipeline to balance real-time guardrails with comprehensive impact assessments under the 60W constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2117", "title": "Responsible AI Evaluation on Edge TPU: Quantized Model Comparison", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare Model A and Model B with model cards, explainability, failure modes, metrics, and guardrails on Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2118", "title": "Responsible AI Optimization on Edge: Bias Mitigation for Real-time Object Detection on NVIDIA Jetson Orin", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the P99 latency spikes and propose a quantifiable fix that also addresses demographic bias at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2119", "title": "Responsible AI for Edge Autonomous Systems on Hailo-8", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build an edge-native Responsible AI and governance framework for Hailo-8 pedestrian detection in delivery robots?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2120", "title": "INT8 Quantization Impact on Regression Heads", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does INT8 quantization make box regression jittery while classification stays accurate, and what FPS can 65 GOPS/frame achieve at 50% Hailo-8 
utilization?", "chain_ids": ["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2121", "title": "Queuing Backpressure on Jetson Orin Edge Inference", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 30 FPS Jetson Orin pipeline queue grow with 40ms inference, and what fixes would make it stable?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2122", "title": "Little's Law Memory Sizing for Hailo-8 Inference Buffer", "topic": "queueing-theory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Little's Law, how much buffer memory is needed for 50 events/s with 15ms processing and 4KB events, and is 256KB SRAM enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2123", "title": "Coral Edge TPU Throughput Bound via Little's Law", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Using Little's Law, is 30 inferences/s plausible with 28ms Coral latency, and what mean concurrency L would it require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2125", "title": "Datacenter vs Edge Efficiency Trade-off for Video Analytics", "topic": "edge-cloud-tco", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do cloud streaming and Jetson Orin edge processing compare in total power and 3-year TCO for 1000 traffic cameras?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2128", "title": "Tail Latency from Thermal Throttling on Jetson Orin", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the Jetson Orin P99 latency to jump from 28ms to 95ms at 40°C, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2129", "title": "Interrupt Latency Spikes in Real-Time Edge Inference", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does Hailo-8 ResNet-50 P99 latency hit 40ms during network bursts, and how would you reduce the host-side tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2130", "title": "Coral Edge TPU Queue Depth 
and Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Coral MobileNetV2 P99 latency rise to 55ms under Raspberry Pi load, and how would you reduce it without blaming USB3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2135", "title": "Roofline Analysis for CNN Inference on Jetson Orin Nano", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where does ResNet-50 conv1 land on the Orin roofline, and is compute or memory bandwidth limiting the 18 TOPS result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2136", "title": "Sliding Window Attention for Long Context on Jetson AGX Orin", "topic": "attention-scaling", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 16K-token sequence, how do full attention and 4096-token SWA compare in KV cache memory usage and per-token decode bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2137", "title": "INT4 Weight-Only Quantization for LLM on Hailo-8", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the W4A8 memory footprint for the 3B LLM on Hailo-8, and how does INT4 dequantization affect throughput?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 1}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2138", "title": "LPDDR5 vs On-Chip SRAM Trade-offs on Jetson Orin for Real-Time Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For YOLOv8-m at 30fps on Orin NX, what memory access pattern matters and what is the binding constraint?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 2}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2139", "title": "Micro-Batch Pipeline Scheduling on Jetson Orin for Video Analytics", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If detection takes 15ms and classification 5ms per frame, what throughput and latency result from pipelining them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2140", "title": "Power Budget Allocation for Multi-Model Pipeline on Jetson 
Orin 60W", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the peak power of this Orin pipeline, and can DLA-offloading detection fit it within a 45W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2141", "title": "Gossip Protocol for Federated Inference on Distributed Jetson Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 5 nodes syncing a 100MB model over 1GbE, how do ring all-reduce and gossip protocol compare in latency, and what is the relative overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2142", "title": "TensorRT INT8 Calibration and Latency Optimization on Jetson Orin", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you calibrate YOLOv8-l for INT8 on Orin, and what latency-accuracy trade-off should entropy calibration achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2143", "title": "Knowledge Distillation for Compressing ViT to MobileNet on Hailo-8", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With feature-based distillation from ViT-B/16 to MobileNetV3-small on Hailo-8, what top-1 accuracy is realistically achievable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2144", "title": "Thermal Design for Fanless Jetson Orin in IP67 Enclosure", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 40°C ambient with 11°C/W total thermal resistance, what TGP can the sealed Orin NX sustain and will 15W throttle?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2145", "title": "Quantization Error Localization for Accuracy Drop on Coral Edge TPU", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 2.5% INT8 accuracy drop when layers 12-15 are the most quantization-sensitive on Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2146", "title": "Model Compression Ratio Evaluation for Bandwidth-Limited Edge Deployment", 
"topic": "pruning-sparsity", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Over 10 Mbps LTE, how do full 50MB OTA updates, weight deltas, and LoRA-style incremental updates compare for the deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2147", "title": "Real-Time Inference Pipeline Specification for Autonomous Drone on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you allocate GPU, DLA, and other compute for the drone's 60fps detection, 10fps segmentation, and 100Hz localization within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2148", "title": "Model Quantization Selection for Coral Edge TPU with Accuracy Constraint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 strategy meets >20fps and <5% rank-1 loss, and why should mixed FP16 precision be avoided on this TPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2149", "title": "Depthwise Separable Conv vs Standard Conv Roofline on Jetson Orin DLA", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "On Orin DLA, how do 3x3 standard and depthwise separable 64->64 convs compare in FLOPs, parameters, and roofline position?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2150", "title": "Thermal Runaway Prevention for Sustained AI Workload on Hailo-8", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 55°C ambient and θ_ja=20°C/W, what power can the accelerator sustain and does a 2.5W continuous workload need throttling?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2151", "title": "Knowledge Distillation from Transformer to CNN for Hailo-8", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What accuracy gain can feature distillation from ViT-S to ResNet-18 deliver on Hailo-8, and what representation issues arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2152", "title": "INT8 Activation Quantization Range Calibration for Object Detection on Jetson Orin", "topic": "quantization-fundamentals", 
"competency_area": "precision", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does YOLOv8-m lose 3.8% mAP in daytime after nighttime-only INT8 calibration, and how would you fix it?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 0}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2153", "title": "Model Partitioning for Collaborative Inference: Edge+Cloud Split on Jetson Orin", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With a 5 Mbps uplink, where should you split the 50-layer ResNet-152 between Jetson Orin and cloud to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2154", "title": "Multi-Frame Batch Accumulation for Throughput on Jetson Orin vs Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What batch size should YOLOv8-s use to maximize throughput while keeping event alerts under 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2161", "title": "Depthwise Convolutions on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing 3×3 convolutions with depthwise separable convolutions cut FLOPs 8× but improve Orin latency only 2×?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2162", "title": "Inference Latency Spikes During OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Cortex-M7 inference latencies spike from 15ms to exactly 55ms during OTA writes to the inactive flash partition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2163", "title": "Thermal Throttling Frame Drops", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone pipeline miss its 33.3ms deadline after clocks are halved, even though throttled TOPS still exceed the workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2165", "title": "TFLite INT8 Conversion on Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", 
"question": "What are the INT8 weight size and remaining SRAM tensor arena after converting the 600,000-byte FP32 model for the Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2166", "title": "ONNX Runtime Mobile vs TensorRT on Jetson Orin for Edge Inference", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the portability of ONNX Runtime worth the 12% latency overhead over standalone TensorRT, and how can it be mitigated?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2167", "title": "Cross-Compiling CUDA Kernels for Jetson's Ampere GPU vs Cloud Ampere", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the sm_80 CUDA kernel achieve 25% lower throughput than expected on the sm_87 edge GPU despite both being Ampere architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2168", "title": "Portable Model Optimization Pipeline for Multi-Edge Deployment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a portable deployment pipeline for the same perception model across Orin, Qualcomm SA8650P, and TI TDA4VM?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2169", "title": "TFLite vs ONNX Runtime Micro for Resource-Constrained Edge Devices", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which runtime yields better throughput per watt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2170", "title": "Handling INT8 Quantization Discrepancies Across Edge Runtimes", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is ORT INT8 quantization 2.3 mAP worse than TensorRT on the same calibration set, and how would you close the gap?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2171", "title": "Power-Aware 
Runtime Selection on Jetson Orin Power Profiles", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should the Orin inference service adapt its model and TensorRT backend as nvpmodel switches from 60W to 15W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2172", "title": "CUDA-to-Vulkan Compute Porting for Cross-Vendor Edge GPUs", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you port the custom CUDA inference kernels to Vulkan Compute for Jetson and Intel Arc, or use a higher-level abstraction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2173", "title": "Model Format Conversion Fidelity: PyTorch to CoreML to ONNX Round-Trip", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you debug the 0.05 FP32 output mismatch from PyTorch->CoreML->ONNX, and what conversion flow should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2174", "title": "Heterogeneous Compute Dispatch on Jetson Orin: GPU vs DLA vs CPU", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you dispatch perception, planning, and SLAM across Orin's GPU, DLAs, and ARM CPU within the 60W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2175", "title": "OTA Model Update Portability for Heterogeneous Edge Fleets", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you package and validate an OTA model and post-processing update for the 5000-device heterogeneous fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2181", "title": "Tail Latency on Edge with Thermal Throttling", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you maintain P99 below 25ms within the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2182", "title": "Latency Jitter from DVFS on Edge Devices", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you stabilize latency without exceeding the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2183", "title": "Priority Preemption for Real-Time Edge Inference", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you prevent the lane model from pushing the pedestrian detector past 15ms while staying within the 60W envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2184", "title": "Deterministic Latency with TensorRT on Orin", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What P99 latency should you expect after converting the PyTorch model to fixed-shape TensorRT, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2185", "title": "Memory Controller Contention on Edge SoCs", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much LPDDR5 bandwidth contention is causing the 40 ms P99 spikes, and how would you restructure preprocessing to reduce them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2186", "title": "Register Pressure on Jetson Orin Ampere SMs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might reducing register usage to 64 registers per thread (causing register spilling) decrease performance despite increasing occupancy?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2187", "title": "Tensor Core Availability on Edge GPUs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 3x3 depthwise conv with 320 channels over 7x7 positions, how many FLOPs is it, and why does this operation fail to utilize Tensor Cores?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2188", 
"title": "INT8 Quantization and Tensor Core Throughput on Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected INT8 latency for the 22ms FP16 path after accounting for memory, quantization, mixed-precision, and overhead terms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2189", "title": "Storage Format for Edge Inference on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will 500K individual JPEG reads take from NVMe, and what storage format would meet the 10-minute recalibration budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2190", "title": "FlatBuffers vs Protobuf for Edge Model Serving", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TFLite choose FlatBuffers, and what performance property would you lose by switching to Protobuf?", "chain_ids": ["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2191", "title": "Model Weight Format for Edge Deployment", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a 200 MB ONNX update take over 50 Mbps, and what update format would cut bandwidth by at least 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2192", "title": "Sparse Format for Edge Model Weights", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should you store the weights in a sparse format (CSR or bitmap+values), and how does this affect inference latency vs memory savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2193", "title": "Differential Privacy for Federated Edge Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What clipping norm and noise multiplier do you choose to achieve (ε=5, δ=1e-5) over 100 aggregation rounds, given that each round samples 100 devices?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": {"edge-chain-auto-secondary-017-40": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-2194", "title": "Privacy Budget for Continuous Edge Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the moments accountant, what noise multiplier σ is needed, and at what point does the model stop improving because the noise overwhelms the gradient signal?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": {"edge-chain-auto-secondary-017-40": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2195", "title": "Tensor Arena Planning for Jetson Orin Multi-Model Serving", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you plan memory allocation to prevent fragmentation?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2216", "title": "Exporting a model with custom autograd to TensorRT for Jetson Orin deployment", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you resolve the ONNX export failure for the custom autograd NMS, and what is the theoretical compute-bound latency for this 4 GFLOP model on Orin's 275 TOPS engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2217", "title": "Reducing activation memory for on-device fine-tuning on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the FP32 activation memory for a batch=8, 96-channel, 28x28 feature map, and why is 7.2 MB an incorrect estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2218", "title": "Trace-based optimization of computational graph for Jetson Orin deployment", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you optimize the 45 ms PyTorch YOLOv8n model for 30 fps inference on Jetson Orin without gradient overhead?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2219", "title": "Implementing efficient backward pass for continual learning on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you compute the Fisher diagonal over 1,000 samples for a 
10M-parameter EWC model efficiently on the edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2220", "title": "Debugging gradient flow through quantization-aware training on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should FakeQuant be implemented so gradients flow during QAT for the INT8 target?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2221", "title": "Memory-efficient inference graph for multi-model pipeline on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you reduce RAM and startup overhead for the three-model Orin pipeline while keeping inference correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2222", "title": "Implementing sparse gradient updates for efficient edge fine-tuning", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement top-k sparse gradient updates for the 20M-parameter transformer to reduce future federated communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2223", "title": "Power-aware training schedule using autograd profiling on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you keep adaptive fine-tuning on Orin within a 25 W budget without hitting the 25 ms thermal-throttled regime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2224", "title": "Federated learning gradient compression with autograd hooks on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you apply PowerSGD so 120 MB gradients can be uploaded over a 1 Mbps link within a federated round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2225", "title": "Analyzing computational graph overhead for real-time control on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the compute-bound inference time for a 4M-op controller on a 275 TOPS Orin, and what unit error makes it 14.5 µs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2226", "title": "Optimizing memory layout of saved tensors for backward on constrained Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you eliminate CPU-GPU activation offload bottlenecks during backward for the depthwise separable model on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2227", "title": "Implementing efficient knowledge distillation with autograd on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you reduce activation memory for on-device distillation when the 20M-parameter teacher is frozen?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2228", "title": "Analyzing when torch.compile helps vs hurts on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For a 5M-parameter inference model, what is the correct FLOP estimate and why is 10 GFLOPs a unit error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2229", "title": "Graph capture stability for streaming inference on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you manage LSTM input and hidden-state buffers when replaying a CUDA graph for streaming audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2230", "title": "Energy-optimal gradient computation schedule for battery-powered edge device", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How many learning steps can you afford per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2231", "title": "Chiplet Design for Edge Inference Power Constraints", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is a 2x5W chiplet design preferable to one 10W monolithic die for an edge inference accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2232", "title": "NUMA-Aware Memory Allocation on Jetson Orin", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "How does Jetson Orin's unified LPDDR5 architecture differ from MI300X, and how do NUMA-like bandwidth effects appear?", "chain_ids": ["edge-chain-auto-secondary-017-03"], "chain_positions": {"edge-chain-auto-secondary-017-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2233", "title": "Die-to-Die Power Gating at Edge", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What power-gating strategy keeps an ISP/ML chiplet edge device under idle power targets while meeting a <10 ms wake latency?", "chain_ids": ["edge-chain-auto-secondary-017-02"], "chain_positions": {"edge-chain-auto-secondary-017-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2234", "title": "Interposer vs Package-on-Package for Edge Chiplets", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which packaging technology is preferable for a wearable device targeting sub-300mW?", "chain_ids": ["edge-chain-auto-secondary-017-01"], "chain_positions": {"edge-chain-auto-secondary-017-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2235", "title": "Heterogeneous Chiplet ISA Compatibility", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What software stack challenges arise when TFLite must target ARM CPU chiplets plus a custom RISC-V ML accelerator die?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2236", "title": "Continual LoRA Adaptation on Edge Devices", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you update the weekly LoRA adapter on the edge device without catastrophic forgetting of prior robot behaviors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2237", "title": "LoRA Inference on Jetson Orin: Adapter Fusion Strategy", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should you merge the LoRA adapter before deployment or apply it dynamically during inference on Jetson Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2238", "title": "PEFT Memory Budgets on Jetson Orin for On-Device Fine-tuning", "topic": "model-adaptation-systems", 
"competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What memory budget and training throughput should you expect for on-device LoRA fine-tuning of a 1B model on Jetson Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2239", "title": "Adapter Compression for Bandwidth-Constrained Edge Deployment", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you deliver LoRA adapter updates to 10,000 edge devices over 1 Mbps cellular links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2240", "title": "Adapter Rollback and Version Management at Edge", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you roll back and diagnose a LoRA adapter OTA update when 3% of 10,000 Orin devices show higher error rates?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2241", "title": "On-Device DP Inference for Medical Wearables", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does adding LDP noise to logits compare with keeping the 500K-parameter ECG classifier entirely on-device?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2242", "title": "DP Noise Calibration for Sensor Fusion on Edge", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did per-feature LDP noise raise false negatives from 2% to 18%, and how would you fix the sensor fusion pipeline?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2243", "title": "Fairness Degradation from Quantization on Edge", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did INT8 PTQ widen the skin-tone FPR gap from 1.8% to 6.2%, and how would you fix the calibration process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2244", "title": "Impact Assessment for Autonomous Medical Triage on Edge", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What components and quantitative thresholds must the AI impact assessment include before deploying the hospital triage assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2245", "title": "PUE and WUE Optimization for ML Training Infrastructure", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the annual PUE overhead energy waste, water consumption, savings from cloud migration, and break-even PUE/WUE for on-prem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2246", "title": "Embodied Carbon Dominance in TinyML Edge Deployment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the embodied-versus-operational carbon split, and does reducing the Pi workload from 5W to 2W meaningfully cut total carbon?", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2247", "title": "Idle Power Governance for Always-On Edge Inference", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Quantify the carbon and cost impact of the current idle waste and evaluate the technician's proposal.", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2248", "title": "Jetson Orin Carbon Footprint for Continuous Edge Inference Deployment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the annual carbon footprint of the 5,000-node edge fleet versus H100 cloud inference, and where is the carbon break-even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2249", "title": "Jetson Orin Thermal Throttling and Inference Latency Tail", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the afternoon p99 latency spike, how does thermal throttling 
explain it, and how would you harden the deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2250", "title": "Stochastic Input Variance and P99 Latency on Jetson Orin Vision Pipelines", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the current architecture meet the p99 < 40ms SLA, and what changes would bring it under the deadline?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2252", "title": "Die-to-Die Bandwidth Adequacy for Edge Vision Pipeline", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is the 50 GB/s die-to-die link the bottleneck for this 4K60 vision pipeline?", "chain_ids": ["edge-chain-auto-secondary-017-02"], "chain_positions": {"edge-chain-auto-secondary-017-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2253", "title": "Power Island Gating Across Chiplet Dies for Edge Battery Life", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What power-gating strategy should you use, and can a 10Wh battery support 72 hours with 8 hours of total NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2254", "title": "POP vs 2.5D Interposer for Edge AI Module Cost", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 500K units/year, how do PoP and 2.5D interposer integration compare on bandwidth and cost?", "chain_ids": ["edge-chain-auto-secondary-017-01"], "chain_positions": {"edge-chain-auto-secondary-017-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2255", "title": "NUMA-Aware Runtime Scheduling on Embedded Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you fix the 25% ML inference latency penalty caused by OS thread migrations between the two CPU dies?", "chain_ids": ["edge-chain-auto-secondary-017-03"], "chain_positions": {"edge-chain-auto-secondary-017-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2256", "title": "LoRA Adapter 
Update Over-the-Air for Edge Devices", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the OTA LoRA adapter update system for 10,000 devices to minimize bandwidth and ensure reliability?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2257", "title": "On-Device LoRA Inference with INT4 Weight Quantization", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you optimize BF16 LoRA adapter computation to eliminate the 35% latency overhead on the INT4 base model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2258", "title": "Federated LoRA Aggregation for Edge Fleet Personalization", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a communication-efficient federated averaging protocol for LoRA?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2259", "title": "LoRA Rank Compression via SVD for Memory-Constrained Inference", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is the 3B INT4 model with an r=64 LoRA adapter failing on 4GB RAM, and how would you fix it without degrading accuracy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2260", "title": "LoRA Fine-Tuning for Domain Shift on Jetson Orin Without GPU", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you configure and run LoRA training on a Jetson Orin NX without a discrete GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2265", "title": "Orin Multi-Modal Fusion Queue Instability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the stability of the fusion queue over a 10-second operational window.", "visual": {"kind": "svg", "path": "edge-2265.svg", "alt": "A linear chart demonstrating the queue size steadily increasing over the 10 second window because service is slower than arrival.", "caption": "Unstable Queue Growth (ρ > 1)"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2266", "title": "Jetson Orin Zero-Copy Camera Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory pipeline that avoids a bus bottleneck, calculating the LPDDR5 bandwidth consumed if a naive CPU-to-GPU data copy is used vs a zero-copy pointer pass.", "visual": {"kind": "svg", "path": "edge-2266.svg", "alt": "Bar chart comparing 12 GB/s memory bandwidth usage in a naive pipeline vs 6 GB/s with zero-copy.", "caption": "Memory Bandwidth Savings (Zero-Copy)"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2271", "title": "Orin Shared LPDDR5 Deficit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory QoS (Quality of Service) scheme to resolve the contention and calculate the resulting memory bandwidth deficit.", "visual": {"kind": "svg", "path": "edge-2271.svg", "alt": "Stacked bar chart showing 230 GB/s request vs 204.8 GB/s limit.", "caption": "Memory Bandwidth Contention."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2276", "title": "Hailo-8 Accelerator Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption over a continuous 1-second operational cycle.", "visual": {"kind": "svg", "path": "edge-2276.svg", "alt": "A square-wave timeline chart showing power spikes to 2.5W for 100ms, followed by 0.1W baseline.", "caption": "Accelerator Power Duty Cycle"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2280", "title": "Edge Gateway Latency Bounds", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum allowable processor utilization (\\u03c1) that ensures the average wait time remains at or below the 40ms constraint?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2281", "title": "Edge KV Cache Capacity Sizing", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming a flat 1 MB of KV cache is required per context token, calculate the maximum number of concurrent 1024-token requests the device can sustain.", "visual": {"kind": "svg", "path": "edge-2281.svg", "alt": "A stacked bar chart showing the breakdown of the 32GB RAM into Weights, OS, and KV Cache.", "caption": "Edge Memory Allocation"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2282", "title": "Edge 4-Stage Synchronous Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the theoretical latency of a single frame and the steady-state pipeline throughput in FPS.", "visual": {"kind": "svg", "path": "edge-2282.svg", "alt": "A four-stage diagonal Gantt chart showing consecutive 16ms blocks overlapping in a standard pipeline execution pattern.", "caption": "Synchronous Edge Pipeline"}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2285", "title": "Orin Mesh Bisection Bandwidth", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the bisection bandwidth of this 4-node fully connected mesh network?", "visual": {"kind": "svg", "path": "edge-2285.svg", "alt": "A complete graph of four nodes where every node is connected to every other node forming a cross inside a box.", "caption": "4-Node Fully Connected Mesh"}, "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2286", "title": "Hailo-8 Solar Camera", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how modifying the sleep/wake schedule to a 20% active duty cycle allows the Hailo-8 to meet the 0.5W budget assuming negligible sleep power.", "visual": {"kind": "svg", "path": "edge-2286.svg", "alt": "Power consumption over time switching between 2.5W and 0W.", "caption": "Hailo-8 Power Cycling"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2287", "title": "Jetson Shared Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the bandwidth difference and physical separation between the CPU and GPU when accessing the shared memory?", "visual": {"kind": "svg", "path": "edge-2287.svg", "alt": "Bar chart comparing zero PCIe overhead to the shared memory bandwidth.", "caption": "Unified Memory Architecture Bandwidth"}, "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 1}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2288", "title": "Jetson Orin Federated Ring", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Recall the communication steps of a Ring AllReduce algorithm used to synchronize gradients across the 4 nodes.", "visual": {"kind": "svg", "path": "edge-2288.svg", "alt": "Ring topology diagram with 4 Jetson nodes connected in a circle.", "caption": "4-Node Ring Topology"}, "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2290", "title": "Accelerator Intersection Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the utilization of the chip using basic queueing theory.", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2292", "title": "Hailo-8 PCIe Gen 3 Limit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the maximum theoretical bandwidth of the PCIe Gen 3 x4 interface connecting the edge host to the accelerator?", "visual": {"kind": "svg", "path": "edge-2292.svg", "alt": "Bar chart comparing 1 lane vs 4 lanes of PCIe Gen 3 bandwidth.", "caption": "PCIe Gen 3 Bandwidth"}, "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 0}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2294", "title": "Edge Video Pipeline Queueing Delay Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the expected queueing delay utilizing M/M/1 principles given the average arrival and service rates.", "visual": {"kind": "svg", "path": "edge-2294.svg", "alt": "Hockey-stick curve showing exponential growth of queueing delay as utilization approaches 1.0.", "caption": "Queue Length vs Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2296", "title": "Accelerator Model State Reload Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the hard downtime in seconds if the reset sequence takes 500ms before memory copying begins.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2297", "title": "Rolling Checkpoints for Drone Tracking", "topic": "fault-tolerance-checkpointing", "competency_area": 
"reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how writing a compact rolling checkpoint to NVMe limits the Recovery Time Objective (RTO).", "visual": {"kind": "svg", "path": "edge-2297.svg", "alt": "Timeline comparing a long fresh initialization vs a short checkpoint recovery.", "caption": "RTO Reduction via Checkpoint"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2298", "title": "PCIe Collective Communication Efficiency", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why Gather-Compute-Broadcast by the host CPU is more efficient here than a device-to-device Ring AllReduce.", "chain_ids": ["edge-chain-auto-secondary-009-21"], "chain_positions": {"edge-chain-auto-secondary-009-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2299", "title": "GMSL Camera PCIe Bottleneck on Orin", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the total required bandwidth and determine if the PCIe Gen4 x4 link will bottleneck the streams.", "visual": {"kind": "svg", "path": "edge-2299.svg", "alt": "Topology showing 4 cameras feeding into a capture card routed over a PCIe x4 bus to the Orin SoC.", "caption": "Camera to PCIe Topology"}, "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2300", "title": "PIR-Triggered Accelerator Power States", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the specific power mode the system and accelerator should maintain prior to the PIR trigger to maximize battery life.", "visual": {"kind": "svg", "path": "edge-2300.svg", "alt": "Timeline showing near-zero sleep power, a PIR trigger event, and an active compute spike.", "caption": "Event-Driven Power Profile"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2301", "title": "Edge Ethernet Transfer Latency", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the transfer latency per frame and determine if the 1 Gbps link is sufficient.", "visual": {"kind": "svg", "path": "edge-2301.svg", "alt": "Simple point to point diagram showing two Orins connected by an Ethernet link.", "caption": "Orin Ethernet Link"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2302", "title": "Jetson Orin Zero-Copy Vision", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory-mapped zero-copy pipeline specification from the camera ISP to the GPU to ensure 4x 1080p 60FPS streams don't bottleneck the 204.8 GB/s LPDDR5 bandwidth.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2303", "title": "Hailo-8 Multi-Model Packing", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the utilization of a single Hailo-8 chip and determine how many additional 1 TOPS pedestrian detection models can fit within the remaining compute budget.", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 2}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2305", "title": "Orin NVJPEG Decoding Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a pipeline utilizing the Orin's hardware NVJPEG decoder and NVMM memory to pass decoded frames directly to TensorRT without CPU intervention.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2306", "title": "Hailo-8 Multi-Tenant SLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a model multiplexing architecture that guarantees a strict 30 FPS SLA for the customer counting model while utilizing idle cycles for the batch inventory model.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2308", "title": "Evaluate effective uptime comparing different edge checkpointing intervals", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether a 1-minute checkpoint interval provides a higher effective application throughput compared to a 5-minute interval.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2309", "title": "Apply double-buffering to calculate total image processing latency", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply a double-buffering strategy to calculate the total time required to process a batch of 4 
frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2310", "title": "Assess queuing stability and utilization for Hailo-8 defect inspection", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Assess the stability of this queuing system and calculate the expected utilization factor of the accelerator.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2311", "title": "Design spatial tiling to fit BEV features in SRAM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a feature map tiling strategy to execute the detection head without spilling intermediate activations to the slow LPDDR5 memory.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2312", "title": "Develop three-year TCO energy model for delivery robot edge compute", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Develop a 3-year TCO model comparing these two options based purely on continuous energy usage at a cost of $0.20 per kWh.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2313", "title": "Design micro-batching strategy to minimize edge pipeline parallelism bubbles", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a micro-batching strategy to minimize pipeline bubbles, and calculate the transfer time for the first micro-batch if the activation is split into 5 chunks.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2314", "title": "Specify memory mapping technique to eliminate offline voice model latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify a memory management technique to eliminate this 500ms model loading latency.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2315", "title": "Calculate peak power difference between data center and edge accelerators", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the difference in peak power consumption between the two hardware deployment choices (400W vs 2x 60W).", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2316", 
"title": "Evaluate hardware encoding to resolve disk bandwidth bottlenecks on edge", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether a hardware H.265 compression step before the disk write will resolve the bottleneck, assuming 100:1 compression.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2318", "title": "Analyze INT4 memory savings enabling larger batch sizes on edge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does quantizing only the model weights to INT4 resolve the out-of-memory error for a batch size of 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2319", "title": "Specify asynchronous network calls to prevent edge inference pipeline blocking", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Specify how asynchronous network calls improve the throughput of the local inference loop.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2327", "title": "Power vs Energy in Edge Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why stepping the Orin down from 60W to 15W mode might not decrease the total battery energy consumed per inference.", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2329", "title": "Theoretical NPU Execution Time", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical minimum time required for a single inference assuming 100% compute utilization.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2330", "title": "PCIe Bottleneck on Edge Accelerator", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why computation-communication overlap fails to keep the NPU fully utilized in this setup.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2331", "title": "PTQ Accuracy Degradation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": 
"diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the mathematical cause of this accuracy collapse and propose a method to diagnose it.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2335", "title": "Edge Factory Queue Stability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the NPU utilization, and does the inference queue remain stable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2337", "title": "Edge Orin Memory Bandwidth Limit", "topic": "kv-cache-management", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the peak memory bandwidth required to load the KV-cache for a single decoding step at 20 tokens/sec?", "visual": {"kind": "svg", "path": "edge-2337.svg", "alt": "Horizontal bar showing 60 GB/s utilization against a 204.8 GB/s limit", "caption": "Memory Bandwidth Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2340", "title": "SRAM Tiering for Generation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory tiering strategy that maximizes the utilization of the SRAM during the generation phase.", "visual": {"kind": "svg", "path": "edge-2340.svg", "alt": "Bar chart showing huge 2TB/s SRAM bandwidth vs small 100GB/s DRAM bandwidth", "caption": "Bandwidth Hierarchy Capabilities"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2342", "title": "Orin Ring AllReduce", "topic": "collective-communication", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the realistic Ring AllReduce time given the asymmetric tree topology, and identify whether a 2-D hierarchical AllReduce would beat a flat 4-node ring across the contended cross-rack link.", "visual": {"kind": "svg", "path": "edge-2342.svg", "alt": "Four Jetson Orin nodes arranged in a logical ring topology.", "caption": "Ring AllReduce logical topology across four Orin nodes."}, "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2343", "title": "Dual Hailo Pipeline FPS", "topic": "pipeline-parallelism", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate end-to-end latency and peak throughput accounting for PCIe transfer and DMA setup overhead.", "visual": {"kind": "svg", "path": "edge-2343.svg", "alt": "Gantt chart 
showing frames overlapping across Stage 1 and Stage 2.", "caption": "Pipeline parallelism execution schedule."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2345", "title": "Camera PCIe Fanout Topology", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the total stream bandwidth and what topology maps cameras to accelerators to prevent bus contention?", "visual": {"kind": "svg", "path": "edge-2345.svg", "alt": "Hierarchical diagram routing 8 cameras into a central PCIe switch, dividing to 4 Hailo-8 chips.", "caption": "PCIe fanout topology for high-bandwidth camera streams."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2346", "title": "Orin Max Batch KV Limits", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the maximum batch size if 24GB of RAM is strictly reserved for the KV cache.", "visual": {"kind": "svg", "path": "edge-2346.svg", "alt": "Bar chart comparing 24GB KV cache capacity to individual 2.14GB request footprints.", "caption": "Memory footprint scaling for KV cache batch size."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2349", "title": "IP Camera Gigabit Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the risk of packet drop on the 1 Gigabit Ethernet port and propose a network-level fix.", "visual": {"kind": "svg", "path": "edge-2349.svg", "alt": "8 cameras funneling into a Gigabit switch pointing to an Orin node, with a bottleneck indicator.", "caption": "Gigabit Ethernet bottleneck for uncompressed video streams."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2350", "title": "Drone Navigation RTO", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the Recovery Time Objective (RTO) constraint if the system must resume control within 2 seconds of a transient glitch.", "visual": {"kind": "svg", "path": "edge-2350.svg", "alt": "Timeline showing a power glitch followed by a 2-second recovery window before flight resumes.", "caption": "Drone recovery time objective timeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2351", 
"title": "PagedAttention Fragmentation", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the reduction in internal memory fragmentation if the KV page size is reduced from 256 tokens to 16 tokens.", "visual": {"kind": "svg", "path": "edge-2351.svg", "alt": "Bar chart contrasting the maximum wasted tokens between 256-token pages and 16-token pages.", "caption": "Maximum internal fragmentation waste per sequence."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2352", "title": "Orin Memory Bandwidth Wall", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the memory bandwidth requirement for this generation speed and determine if the Orin can support it.", "visual": {"kind": "svg", "path": "edge-2352.svg", "alt": "Bar chart showing demanded 280 GB/s bandwidth overshooting the 204.8 GB/s limit of the Orin.", "caption": "Bandwidth limit violation during fast token generation."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2354", "title": "Jetson AGX Orin Queueing Latency Spike", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why latency on the Orin diverges from textbook M/M/1 predictions as utilization approaches 100%.", "visual": {"kind": "svg", "path": "edge-2354.svg", "alt": "A curve showing latency staying low until utilization hits about 80%, after which it spikes upward.", "caption": "M/M/1 Queueing delay demonstrating the hockey stick effect."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2355", "title": "Orin Memory Hierarchy Bandwidth Comparison", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the theoretical maximum memory bandwidth of the AGX Orin's LPDDR5 and compare its order of magnitude to standard L1 cache bandwidth.", "visual": {"kind": "svg", "path": "edge-2355.svg", "alt": "Bar chart comparing L1 Cache, L2 Cache, and LPDDR5 bandwidths.", "caption": "AGX Orin typical memory tier bandwidth comparison."}, "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 0}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2357", "title": "PagedAttention Block Size Fragmentation", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Starting from the fragmentation comparison shown, explain why the fragmentation argument inverts on this iGPU once you account for block-table entries, and recommend a block size.", "visual": 
{"kind": "svg", "path": "edge-2357.svg", "alt": "Bar chart showing high internal fragmentation for 256-token blocks compared to 16-token blocks.", "caption": "KV Cache Internal Fragmentation Ratio."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2358", "title": "Jetson AGX Orin Duty-Cycle Energy Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply duty-cycling formulas to calculate the total energy consumed in Joules over exactly 1 hour.", "visual": {"kind": "svg", "path": "edge-2358.svg", "alt": "A step wave oscillating between 15W for brief periods and 5W for long periods.", "caption": "Active vs Idle Power over Time."}, "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2359", "title": "AGX Orin LPDDR5 Effective Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does practical bandwidth utilization collapse under random memory access compared to contiguous streaming?", "visual": {"kind": "svg", "path": "edge-2359.svg", "alt": "Bar chart comparing sequential 160 GB/s against random access 30 GB/s with a red dashed line at peak 204.8 GB/s.", "caption": "Effective Bandwidth vs Access Pattern."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2360", "title": "Orin NVMe Interconnect Standard", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the primary high-speed interconnect topology standard used by the Jetson AGX Orin to attach local NVMe storage.", "visual": {"kind": "svg", "path": "edge-2360.svg", "alt": "Block diagram showing the Orin SoC connected to NVMe via PCIe Gen4.", "caption": "PCIe Gen4 x4 NVMe Interconnect Topology."}, "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2361", "title": "Dual-Tier Edge Checkpoint Formulation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a dual-tier checkpointing mechanism that satisfies both local storage wear limits and narrow remote bandwidth constraints.", "visual": {"kind": "svg", "path": "edge-2361.svg", "alt": "Graph showing small frequent local checkpoints and one large rare cloud checkpoint.", "caption": "Dual-Tier Checkpointing Profile."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-2362", "title": "KV Cache Pre-allocation Memory Waste", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the maximum batch size before OOM under static max-length allocation versus dynamic paged allocation?", "visual": {"kind": "svg", "path": "edge-2362.svg", "alt": "Bar chart comparing high memory allocation for Static Max against low usage for dynamic paging.", "caption": "KV Cache Allocation Efficiency: Static vs Paged."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2363", "title": "Orin YOLOv8 Queue Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply the M/D/1 (deterministic-service) formula to compute mean response time per frame, and compare against the M/M/1 prediction. Quantify the latency savings from deterministic service in this realistic GPU-inference setting.", "visual": {"kind": "svg", "path": "edge-2363.svg", "alt": "A hockey-stick curve showing latency skyrocketing as utilization approaches 1.0.", "caption": "M/M/1 Latency vs Utilization."}, "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 2}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2364", "title": "Hailo-8 Camera Fan-in Link", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Compute the aggregate ingress bandwidth for 4 and 6 cameras, identify which stage saturates first, and explain the order of binding constraints.", "visual": {"kind": "svg", "path": "edge-2364.svg", "alt": "Four cameras feeding into a single saturated network switch.", "caption": "4:1 Fan-in network topology causing congestion."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2365", "title": "Orin Power Mode Threshold", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the energy consumed per inference in each mode, and which is more energy-efficient?", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2367", "title": "Hailo-8 Dual Model Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Using the overlapping pipeline as a baseline, what are the 
actual latency and steady-state throughput of the single-fabric multi-context execution?", "visual": {"kind": "svg", "path": "edge-2367.svg", "alt": "Bubble Gantt chart showing stage 1 (10ms) overlapping with stage 2 (15ms).", "caption": "Pipelined model execution on Edge TPU."}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2368", "title": "Orin 4K Memory Bandwidth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the percentage of total memory bandwidth consumed by writing these frames against the LPDDR5 limit (204.8 GB/s).", "visual": {"kind": "svg", "path": "edge-2368.svg", "alt": "Bar chart comparing massive 204.8 GB/s capacity against tiny 1.5 GB/s camera write.", "caption": "Camera write bandwidth vs total LPDDR5 capacity."}, "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 1}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2369", "title": "Hailo-8 SSD RTO Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the Recovery Time Objective (RTO) limit imposed by the SSD speed.", "visual": {"kind": "svg", "path": "edge-2369.svg", "alt": "Timeline showing 2 seconds of recovery time.", "caption": "RTO bound by SSD read speed."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2370", "title": "Edge Multi-Camera Sizing", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate the system's TOPS requirement AND the LPDDR5 memory-bandwidth requirement, then specify whether the system is compute-bound or bandwidth-bound. 
Identify the minimum Orin power mode (15W, 30W, or 60W) that satisfies both.", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 4}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2371", "title": "M/M/1 Robotics Processing", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the average wait time in the queue, and determine if the total system latency meets a 100ms end-to-end deadline.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2372", "title": "Hailo-8 Zero-Copy Gateway", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct an end-to-end multi-process serving architecture that maintains the 2.5W power envelope while moving data from the video decoder to the Hailo engine.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2373", "title": "Priority Queue Multi-Camera Scheduling", "topic": "queueing-theory", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can the scheduling model be optimized to ensure safety-critical cameras bypass FIFO without starving the other 8 cameras?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 4}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2374", "title": "Triton Dynamic Batching on Edge", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a multi-tenant serving architecture using NVIDIA Triton to allocate GPU memory effectively and implement dynamic batching?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 4}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2375", "title": "Hailo-8 Batching vs Latency", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze whether buffering 10 frames into a single batch is more energy-efficient than processing each frame individually as it arrives.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2376", "title": "DETR Attention Tiling Edge", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Analyze the LPDDR5 
memory traffic for the self-attention maps and create a tiling strategy to prevent spilling out of the Orin's L2 cache.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2377", "title": "Edge Device Checkpointing Under Variable Power Modes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the trade-offs of asynchronous versus synchronous local NVMe checkpointing for a 2GB active map state under 204.8 GB/s memory bandwidth constraints.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2378", "title": "Asynchronous Double-Buffering for Edge Bounding Box Tracking", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply a double-buffering scheme to overlap the host's bounding box decoding with the accelerator's INT8 convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2379", "title": "Mixed-Precision Quantization Aware Training for Edge GPUs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a custom quantization-aware training scheme that targets INT8 tensor cores while selectively keeping sensitive depthwise layers in FP16.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2380", "title": "Zero-Copy DMA Pipeline for Accelerated Edge Video Streams", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a hardware-accelerated pipeline specification that offloads decoding to a VPU and uses zero-copy memory buffers.", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 4}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2381", "title": "Dual-Bank Atomic Checkpointing on Edge Storage Media", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a dual-bank checkpointing mechanism where the Orin writes to an A/B partition on the eMMC over the alternative of simply journaling gradients.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2382", "title": "Overlapping Autoregressive Generation with Wi-Fi Transmission", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "Specify how to overlap the autoregressive INT8 token generation step with the network transmission of previous tokens to minimize inter-token latency.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2383", "title": "Energy Costs of Accelerator Initialization Versus Inference", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the total energy cost per positive detection if the accelerator requires 500ms to boot and load weights over PCIe before it can perform a 20ms inference.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2384", "title": "Hardware Video Decoder Offloading in Edge ML Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Summarize the role of the NVDEC hardware decoder in the data ingestion pipeline before the tensor cores execute the model.", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 0}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2385", "title": "Asynchronous Gradient Clipping for Unstable Edge Connections", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the use of asynchronous gradient clipping and local momentum buffering to maintain training stability when a node reconnects after missing 3 synchronization rounds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2386", "title": "Deterministic M/D/1 Queuing Delay for Edge Camera Streams", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the system as an M/D/1 queue to find the average wait time in the buffer before processing begins.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2389", "title": "Edge Autoregressive Bandwidth", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how to estimate the minimum memory bandwidth required to generate 15 tokens per second for a single user.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2391", "title": "Jetson Multi-Model Triton Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": 
"Design a serving architecture to maximize GPU utilization and handle varying frame rates among the 5 concurrent models?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 2}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2392", "title": "Hailo-8 PIR Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy efficiency of using a small PIR motion sensor to trigger the Hailo-8 versus keeping the Hailo-8 continuously active at 1 FPS.", "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 4}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2394", "title": "Multi-Camera Batch Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the serving technique that combines independent frames into a single batch, and quantify the latency-vs-throughput trade-off to meet the 33ms SLA.", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 0}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2396", "title": "Thermal Throttling Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a diagnostic framework to pinpoint the compute bottlenecks and propose an architectural modification to keep the power strictly under 30W.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2398", "title": "INT8 Requantization Fallback", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What quantization-related alignment issue causes a residual connection addition to fail or perform poorly on an integer-only accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2399", "title": "Orin Zero-Copy Cropping", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you handle the stride mismatch and alignment constraints to pass the cropped region to the GPU via zero-copy without falling back to CPU memcpy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2402", "title": "Orin Thermal Throttle Floor for SLA", "topic": "compute-cost-estimation", "competency_area": "compute", "track": 
"edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Determine whether the thermally-throttled 30W envelope meets the 30 FPS SLA, and recommend whether thermal mitigation or model trimming is the cheaper fix.", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 0}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2404", "title": "Gateway Batch Pipeline Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum effective throughput and utilization of the Orin pipeline to determine if it can sustain the gateway arrival rate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2406", "title": "PCIe vs Compute Bind-Flip Under Thermal Throttle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the steady-state FPS budget by computing the PCIe-bound and compute-bound FPS in both cold and warm regimes, and identify the binding constraint in each.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2408", "title": "Drone Edge Compute Energy Penalty", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total extra energy in Joules consumed by executing the mapping operation compared to just remaining in the 15W idle state.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2409", "title": "Industrial Camera LPDDR5 DMA Tax", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Analyze concurrent bus contention by computing aggregate LPDDR5 utilization from camera DMA writes + model weight reads + activation reads. 
Determine whether any of the three streams will be bandwidth-starved by the others.", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 5}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2410", "title": "Evaluating Flash Storage Endurance for Edge Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the feasibility of this checkpointing strategy regarding storage lifespan and compute interruption, given the hardware specs.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2411", "title": "Calculating Communication Hiding Feasibility on PCIe Edge", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the minimum network transfer time per feature map and determine if the communication can be perfectly hidden behind the computation.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2413", "title": "Specifying Hardware Accelerated CV Pipelines on Edge", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you specify a hardware-accelerated pipeline path to eliminate the CPU bottleneck for this workload?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 2}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2414", "title": "Designing Supercapacitor Emergency State Flushes for Edge", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an emergency checkpointing mechanism utilizing the supercapacitor, calculating if it holds enough energy to flush 50 MB of state to an NVMe drive writing at 500 MB/s.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2415", "title": "Evaluating Unified Memory Asynchronous Transfers on SoC", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why the memory copies might still serialize with computation despite using asynchronous CUDA APIs.", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-2416", "title": "Coral TPU Batching Effect on Queue Stability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether batch-of-1, batch-of-2, or batch-of-4 best satisfies the 60 fps arrival rate at the 80 ms p95 SLA using M/D/1 queueing theory.", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 4}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2417", "title": "Calculating Maximum Framerate from Accelerator TOPS Capacity", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do you calculate the theoretical maximum framerate the Hailo-8 can achieve, and what is the relationship between TOPS and GOPS?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 0}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2419", "title": "Analyzing Inter-Process Communication Overheads in Video Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does the Python `multiprocessing` queue impact memory bandwidth and latency for this 4K video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2420", "title": "Transient Queue Buildup from Stochastic Arrivals", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the system require a queue buffer even though the maximum processing rate (40 FPS) safely exceeds the average arrival rate (30 FPS)?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 1}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2424", "title": "TensorRT PTQ Calibration on Orin with 200 Scenes", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which TensorRT INT8 calibrator best fits the 200-scene constraint, and what concrete TensorRT mitigations are available before resorting to QAT.", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 2}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2425", "title": "Heuristic Priority Queuing for Edge Vision", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you apply a dynamic queueing mechanism to minimize overall dropped frames of high-value events without upgrading hardware?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 1}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2426", "title": "Zero-Copy Memory on Unified Edge Devices", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a 'zero-copy' memory buffer, and why does it drastically reduce latency on unified memory architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2427", "title": "Dynamic Batching Inefficiencies in TensorRT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why dynamically sending a batch size of 3 might result in sub-optimal latency compared to an engine compiled specifically for batch 3.", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 1}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2428", "title": "Symmetric INT8 Quantization Formula", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the standard mathematical formula used to map an FP32 value to a symmetric INT8 representation.", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 0}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2429", "title": "Recovering INT8 Activation Clipping", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a mixed-precision diagnostic strategy to restore mAP without falling below the 20 TOPS throughput requirement.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2430", "title": "D/G/1 Jetson Buffer Limits", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the system's queue dynamics to determine if an infinite buffer is necessary or if a finite drop-policy is required.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2431", "title": "Jetson Zero-Copy Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", 
"track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why standard CPU memory buffers are highly inefficient on this platform and name a better primitive.", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 3}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2432", "title": "Supercapacitor Graceful Shutdown", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain the critical software role of the graceful shutdown checkpoint during this 5-second window, naming the specific subsystems and the canonical order in which the shutdown sequence must execute them.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2433", "title": "TensorRT INT8 Speedup Bound", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical maximum speedup of a purely compute-bound matrix multiplication moving from FP16 to INT8.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2434", "title": "Drop-Oldest Queue Policy", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why implementing a LIFO or drop-oldest queueing policy improves real-time security alerts.", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 0}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2436", "title": "PCIe Double-Buffering for Edge Accelerators", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the transfer latency, and can a 2-frame double buffer fully overlap communication with computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2441", "title": "Diagnosing Attention Collapse in INT8 ViTs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the specific mathematical operation in the transformer architecture that fails under uniform INT8 and propose an architectural precision override.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2442", "title": "Evaluate Accelerator Mismatch for Autoregressive LLMs", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": 
"optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an end-to-end evaluation of this deployment architecture and identify the fundamental hardware mismatch preventing real-time text generation.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2444", "title": "Orin SSD Wear versus Recovery Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between checkpointing every 1 minute versus every 10 minutes regarding SSD wear-out time and worst-case recovery compute.", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 3}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2445", "title": "Orin DMA and Inference Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an overlapping execution schedule using CUDA streams and DMA engines to maximize total system frames-per-second.", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2447", "title": "Hailo-8 Inference Latency Estimate", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Estimate the single-frame inference latency in milliseconds assuming the compiler achieves a hardware utilization efficiency of precisely 25%.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2448", "title": "Orin Parameter Bandwidth", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the minimum memory bytes read from LPDDR5 strictly to load the weights for one full forward pass if stored in FP16 versus INT8.", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 0}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2449", "title": "Camera Batching Latency Evaluation", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether Batch=1 or Batch=4 minimizes the end-to-end tail latency for the worst-case camera frame.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-2457", "title": "Energy Calculation for Drone Collision Avoidance on Hailo-8", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total energy consumed by the Hailo-8 accelerator over 1 minute of active flight.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2459", "title": "Latency Reduction via GPU Unified Memory Data Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how replacing the CPU alignment step with a custom CUDA kernel impacts the overall pipeline latency.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2460", "title": "Memory Bandwidth Limits on Orin Vision", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Evaluate why the Orin might still fail to hit 30 FPS despite having 5.5x the required peak compute TOPS.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2462", "title": "Layer Fusion for Hailo-8 SRAM Savings", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply layer fusion to reduce the peak memory footprint of the activation maps during inference?", "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 1}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2463", "title": "Symmetric Zero-Point Quantization on Edge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why activating symmetric vs. 
asymmetric quantization impacts the zero-point handling during integer matrix multiplication on edge NPUs.", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 2}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2466", "title": "Cold Boot Power Penalties on Edge Devices", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the power traces showing a 2-second high-power phase before every inference to diagnose the flaw in the duty-cycling logic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2467", "title": "Dynamic Range Quantization for Memory Bandwidth", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply dynamic range quantization to the model and explain which operations still run in FP32.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2469", "title": "Constructing Edge Duty-Cycling Power Budgets", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct a duty-cycling strategy that meets a daily budget of 75Wh while processing each trigger with a 5-second inference window.", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 3}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2472", "title": "Applying Zero-Copy Unified Memory Architectures", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is Zero-Copy advantageous for feeding camera input directly to the GPU in a Unified Memory Architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2475", "title": "Analyzing Memory Bus Contention Between Subsystems", "topic": "communication-computation-overlap", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the impact of a unified memory bottleneck if the DMA controller and the GPU compete for LPDDR5 simultaneously.", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2476", "title": "Compute Allocation Plans for Heterogeneous Models", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a compute resource allocation plan and determine the system's expected 
utilization.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2478", "title": "Drone Pre-Processing Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the data pipeline bottlenecks between memory, CPU, and GPU to determine why the accelerator is starved.", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 4}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2479", "title": "Dynamic Model Multiplexing", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a dynamic model multiplexing infrastructure that ensures zero downtime during product line switchovers while maintaining the 2.5W power envelope.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2480", "title": "Wildlife Camera Energy Waste", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the total energy consumption per hour and determine the percentage of energy wasted during the idle periods.", "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2481", "title": "Realistic Orin Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply hardware efficiency factors to estimate the realistic maximum frame rate the device can achieve, assuming a 30% hardware utilization rate.", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 2}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2482", "title": "Edge Server Queue Depth", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the utilization of the accelerator and specify the required queue length to hold the average number of queued requests.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2483", "title": "Vision Transformer Memory Spillage", "topic": "memory-hierarchy-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why memory 
bandwidth bottlenecks a 50-GFLOP Vision Transformer despite the 204.8 GB/s limit, and propose a memory-fusion strategy.", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 4}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2484", "title": "Edge ID Persistence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a strategy to maintain continuous tracking IDs across reboots without writing high-frequency video data to the SD card.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2485", "title": "CUDA Stream Scheduling", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify the scheduling timeline using CUDA streams to achieve maximum throughput, and calculate the time to process 10 batches.", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2487", "title": "Hailo-8 M/D/1 Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Compute the expected queue length assuming an M/D/1 queueing model for the inference requests.", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 0}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2489", "title": "Edge Drone WAL Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How effective is an A/B ping-pong checkpoint strategy combined with write-ahead logging (WAL) for preventing state corruption?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 4}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2493", "title": "PCIe Video Streaming Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze whether the PCIe bidirectional bandwidth becomes a bottleneck for streaming these uncompressed frames to the A100.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2504", "title": "10GbE Pipeline Microbatch Sizing", 
"topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the minimal microbatch size m needed to keep the pipeline bubble overhead strictly below 15%.", "chain_ids": ["edge-chain-auto-secondary-017-22"], "chain_positions": {"edge-chain-auto-secondary-017-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2506", "title": "PCIe Arbitration Jitter Buffer", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the minimum asynchronous queue depth required between stages to mathematically hide this jitter.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2508", "title": "BLE Connection Sync Buffering", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a pipelined microbatching strategy to handle the BLE latency without stalling the 10ms compute stages.", "chain_ids": ["edge-chain-auto-secondary-017-22"], "chain_positions": {"edge-chain-auto-secondary-017-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2509", "title": "Wi-Fi 6 UAV AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum transmission time required for the AllReduce, given the half-duplex shared medium constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2510", "title": "ESP-NOW Tree AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the synchronization latency, assuming sequential hops due to the shared wireless medium.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2511", "title": "1GbE Orin AllGather Sync", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total latency of the Ring AllGather operation across the four nodes?", "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2513", "title": "CAN Bus Protocol Framing Penalty", 
"topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective data transfer time for 3 nodes sequentially sending to 1 central parameter server node.", "chain_ids": ["edge-chain-auto-secondary-009-20"], "chain_positions": {"edge-chain-auto-secondary-009-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2514", "title": "Hierarchical Robot Wi-Fi 6 Sync", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the total execution time for a Local Reduce -> Global AllReduce -> Local Broadcast topology.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2515", "title": "LoRa Mesh Gossip Protocol", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a collective topology that minimizes synchronous barrier stalls under this high-latency, lossy environment.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2518", "title": "NPU Preemption Context Switch", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total preemption latency bubble injected into the LLM pipeline given a 50 GB/s bandwidth.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2519", "title": "Unified Memory KV Read Bandwidth", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the memory bandwidth utilization consumed purely by the KV cache during generation at 20 tokens per second?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2520", "title": "Prefix Caching TTFT Bubble", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the pipeline bubble (TTFT delay) created by synchronizing the new tokens with the cached state.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2522", "title": "UFS 4.0 Storage KV Offload", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the pipeline stall time introduced by retrieving this offloaded context block.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2523", "title": "Multimodal KV Ring Buffer", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you create a KV cache memory strategy that prevents Unified Memory OOM without causing CPU garbage collection stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2525", "title": "PCIe Switch Oversubscription", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the bandwidth bottleneck and synchronization delay of this concurrent DMA.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2526", "title": "1GbE Star Topology Collision Domain", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total transfer time, accounting for the collision domain bottleneck.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2527", "title": "RP2040 Dual-Core SRAM Contention", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the synchronization overhead per 1000 audio samples if accesses align perfectly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2528", "title": "CSI-2 Direct vs PCIe Switched Latency", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the added synchronization overhead of the store-and-forward FPGA topology per frame.", "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2529", "title": "Jetson Cluster Ring vs Star Broadcast", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization latency penalty of using a Ring instead of a Star topology for a one-to-all broadcast.", "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2531", "title": "PCIe Fabric Ring Sched", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": 
"mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an interconnect topology and collective routing scheme to minimize tail latency, accounting for the PCIe switch overheads and Orin's LPDDR5 bandwidth limits.", "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2533", "title": "Demand Paging for Edge Model Deployment", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does mmap with unified memory solve this, and what are the latency implications?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2534", "title": "KV-Cache-Aware Load Balancing at the Edge", "topic": "load-balancing", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are multi-turn conversations slower with round-robin routing across 4 edge nodes, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2536", "title": "Diagnosing Zero Latency Gains from Unstructured Pruning on Coral TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the 75% unstructured sparsity fail to yield any latency improvements on the Coral Edge TPU, and what architectural characteristic of the accelerator dictates this outcome?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 2}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2540", "title": "The Attention Bandwidth Bottleneck", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the SoC's compute drastically underutilized during the MHA layers, causing a deadline miss despite the low total FLOP count?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 3}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0000", "title": "The HBM vs L1 Latency Gap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", 
"question": "Roughly how much slower is accessing HBM3 memory compared to an L1 register read?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0001", "title": "The Energy Tax of Data Movement", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which operation consumes more energy: performing an FP16 multiply-add or reading the operands from DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0002", "title": "The FP16 Model Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM does it occupy just to load the weights in FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0003", "title": "The Ridge Point Logic", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the ridge point of this accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0004", "title": "The FP16 vs INT8 Precision Choice", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do we move to 8-bit integers for deployment?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 0}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0005", "title": "The FLOPS vs Time Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If your GPU has a peak performance of 100 TFLOPS, what is the theoretical minimum time to finish this operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0006", "title": "The Battery Drain Math", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many hours of continuous inference could you theoretically run on this 15 Watt-hour battery?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0007", "title": "The Embedding OOM Screen", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before you even look at the code, what basic math did you fail to do?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["You forgot to set PyTorch's `max_split_size_mb` configuration.", "The Adam optimizer's momentum states consume 3x the memory of the weights.", "100M embeddings at FP32 (128-dim) requires 51.2GB, which physically exceeds the 16GB VRAM.", "The PCIe Gen3 bus is too slow to transfer the embeddings in time."], "correct_index": 2}}, {"id": "global-0009", "title": "The PCIe Bandwidth Screen", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the theoretical peak bandwidth of a PCIe Gen4 x16 slot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.5 TB/s", "~32 GB/s", "~10 Gbps", "~400 Gbps"], "correct_index": 1}}, {"id": "global-0010", "title": "The PyTorch DataLoader Deadlock", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why will setting `num_workers=0` make the Senior Engineers instantly reject your code for a production environment?", "chain_ids": ["global-chain-auto-secondary-017-23"], "chain_positions": {"global-chain-auto-secondary-017-23": 0}, "chain_tiers": {"global-chain-auto-secondary-017-23": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It forces PyTorch to use FP64 instead of FP32.", "It causes PyTorch to spawn too many zombie processes.", "It forces synchronous data loading on the main thread, starving the GPU.", "It disables the L1 cache on the CPU."], "correct_index": 2}}, {"id": "global-0011", "title": "The Cost of Data Movement", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which operation consumes significantly more energy on a modern accelerator: performing the FP16 multiply-add operation or reading the operands from main memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FP16 multiply-add consumes about 10x more energy.", "They consume roughly the same amount of energy.", "Reading from main memory consumes ~100x to 1000x more energy.", "Compute consumes more energy only if batch size is exactly 1."], "correct_index": 2}}, {"id": "global-0013", "title": "The Parameter Memory Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If the weights are stored in FP16, what is the absolute minimum GPU memory required just to hold the weights?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 0}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.5 GB", "~7 GB", "~14 GB", "~28 GB"], "correct_index": 2}}, {"id": "global-0014", "title": 
"The KV-Cache Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What memory structure is the primary cause of this slowdown?", "chain_ids": ["global-chain-auto-secondary-011-33"], "chain_positions": {"global-chain-auto-secondary-011-33": 0}, "chain_tiers": {"global-chain-auto-secondary-011-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The L1 Instruction Cache", "The Parameter Server", "The Gradient Checkpoint buffer", "The Key-Value (KV) Cache"], "correct_index": 3}}, {"id": "global-0015", "title": "Quantization Basics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Why is INT8 quantization so popular for deploying models on edge devices like mobile phones?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 0}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It reduces memory bandwidth pressure by 4x and allows the use of highly energy-efficient integer ALUs.", "It automatically sparsifies the network, dropping 75% of the parameters to save memory.", "It increases the mathematical precision of the final output layer by removing floating-point noise.", "It maps 32-bit floats to 8-bit floats, keeping the same numerical distribution but running faster."], "correct_index": 0}}, {"id": "global-0016", "title": "The Purpose of the Roofline Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does this tell you about your system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is CPU-Bound due to slow instruction dispatch.", "The workload is Memory-Bound (Bandwidth constrained).", "The workload is Compute-Bound (ALU constrained).", "The hardware is experiencing thermal throttling on the memory bus."], "correct_index": 1}}, {"id": "global-0017", "title": "Network Topologies", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do we use network topologies like Fat-Tree (Clos) instead of a simple traditional Star or Ring network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is the only topology supported by PCIe Gen5.", "It provides high, non-blocking bisection bandwidth across the entire cluster.", "It allows GPUs to share a single unified L2 cache.", "It eliminates the need for network switches entirely."], "correct_index": 1}}, {"id": "global-0018", "title": "Data Parallelism vs Model Parallelism", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L2", "zone": 
"recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Which distributed strategy should you use to efficiently scale training across 8 GPUs?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 0}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Tensor Parallelism: Shards the 14GB model into 1.75GB pieces, wasting bandwidth on communication.", "Pipeline Parallelism: Divides the model into 8 stages of 1.75GB each, introducing severe pipeline bubbles.", "Data Parallelism: Replicates the 14GB model 8 times (112GB total), achieving near-linear 8x speedup.", "Expert Parallelism: Routes tokens to 8 different 14GB experts, eliminating communication overhead."], "correct_index": 2}}, {"id": "global-0019", "title": "SRAM vs DRAM Characteristics", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why is SRAM typically used for on-chip buffers instead of DRAM in deep learning accelerators, despite its lower density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["DRAM has higher latency but significantly higher bandwidth per pin than SRAM.", "SRAM is denser and allows for larger on-chip memory capacity compared to DRAM.", "SRAM provides lower latency and higher bandwidth without the need for periodic refresh cycles, unlike DRAM.", "DRAM is volatile while SRAM is non-volatile, making SRAM better for persistent weights."], "correct_index": 2}}, {"id": "global-0020", "title": "Arithmetic Intensity & Roofline Model", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In the context of the Roofline Model, what does it mean if a specific layer in a neural network has very low arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The operation is compute-bound because 312 TFLOPS / 4 FLOPs = 78 trillion elements per second.", "The operation performs very few computations per byte of memory accessed, meaning it is strictly memory bandwidth bound.", "The operation achieves 312 TFLOPS because 0.5 FLOPs/Byte is less than the 156 FLOPs/Byte ridge point.", "The layer is bounded by the 40MB L2 cache since 8 bytes * 4 = 32 bytes exceeds cache capacity."], "correct_index": 1}}, {"id": "global-0021", "title": "High Bandwidth Memory (HBM) Architecture", "topic": "extreme-quantization", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does the bandwidth of one HBM3 stack compare to one GDDR6 chip, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local TinyML detection with event-based transmission (0.14 Wh/day).", "Continuous cloud streaming detection (4.80 Wh/day).", "Local TinyML detection without transmission (4.80 Wh/day).", "Event-based transmission 
bypassing MCU (0.12 Wh/day)."], "correct_index": 0}}, {"id": "global-0022", "title": "Cache Thrashing & Matrix Operations", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What architectural issue likely causes severe slowdown and high L2 miss rates in a custom CUDA matmul kernel, and how would you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 67MB matrices perfectly fit into the 40MB L2 cache, meaning the misses are a profiling artifact.", "Poor spatial locality is causing cache lines to be evicted before they can be fully utilized; breaking into 4KB tiles resolves this.", "The 4096-thread block size exceeds the L2 capacity of 32 threads, causing automatic bypass.", "The memory controller limits L2 bandwidth to 67MB/s, requiring execution throttling."], "correct_index": 1}}, {"id": "global-0023", "title": "Memory Hierarchy Latency Profiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the correct fastest-to-slowest order of the memory hierarchy when profiling data movement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["L1 Cache -> L2 Cache -> Main Memory (DRAM) -> Solid State Drive (NVMe)", "L1 Cache -> Main Memory (DRAM) -> L2 Cache -> Solid State Drive (NVMe)", "Main Memory (DRAM) -> L1 Cache -> L2 Cache -> Solid State Drive (NVMe)", "L2 Cache -> L1 Cache -> Solid State Drive (NVMe) -> Main Memory (DRAM)"], "correct_index": 0}}, {"id": "global-0025", "title": "Roofline Model Interpretation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this layer compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 50 TFLOPS is reached.", "Memory-bound, because the layer's arithmetic intensity (50) is less than the hardware's ridge point (100 FLOPs/byte).", "Compute-bound, because 100 TFLOPS is the absolute limiting factor for this layer.", "Memory-bound, because the layer achieves 100 TFLOPS at 50 FLOPs/byte."], "correct_index": 1}}, {"id": "global-0026", "title": "Optimizing Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the most effective way to improve the arithmetic intensity of the bias-plus-ReLU element-wise sequence?", "chain_ids": ["global-chain-auto-secondary-006-30"], "chain_positions": {"global-chain-auto-secondary-006-30": 0}, "chain_tiers": {"global-chain-auto-secondary-006-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantize the model to INT8 to reduce the compute time of the element-wise operations.", "Perform kernel fusion to combine the operations into a single kernel, reducing reads and writes to 
global memory.", "Increase the batch size to artificially inflate the FLOP count without changing memory accesses.", "Move the element-wise operations to the CPU to free up GPU tensor core resources."], "correct_index": 1}}, {"id": "global-0027", "title": "Batch Size and Compute Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does increasing the batch size affect the arithmetic intensity of a linear layer?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 0}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It decreases arithmetic intensity because larger batches require proportionally more memory to store the activations.", "It has no effect on arithmetic intensity since the model weights remain the exact same size regardless of batch.", "It increases arithmetic intensity by reusing the loaded model weights across multiple inputs, amortizing memory access costs.", "It increases arithmetic intensity by physically reducing the total number of FLOPs required to process the data."], "correct_index": 2}}, {"id": "global-0028", "title": "LLM Generation Phase Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does auto-regressive LLM decoding typically suffer from exceptionally low arithmetic intensity?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 1}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A single token is generated per step, requiring the model to load all weights and the entire KV cache from memory just to perform a small matrix-vector multiplication.", "The self-attention mechanism requires complex non-linear operations (like Softmax) that natively have low arithmetic intensity.", "Token generation requires frequent inter-GPU communication, which bottlenecks the compute operations.", "The context window is too small during generation, preventing the GPU from utilizing its tensor cores effectively."], "correct_index": 0}}, {"id": "global-0034", "title": "Post-Training Quantization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the most likely cause of the severe accuracy drop after int8 post-training quantization on the 256KB-RAM microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 256KB RAM is completely exhausted because INT8 requires 4 bytes per parameter.", "There are extreme outliers in the model's weight or activation distributions, compressing normal values into just 3 distinct quantization levels.", "Int8 quantization mathematically guarantees a 50% accuracy drop on all CNNs without retraining.", "The scale factor of 0.003 is too small for the 
microcontroller's ALU to process."], "correct_index": 1}}, {"id": "global-0035", "title": "Memory Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Will the 150KB-parameter, 100KB-activation model fit on 512KB Flash and 128KB SRAM, and where should each reside?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, everything can be stored in SRAM since 150KB + 100KB = 250KB, and Flash can act as virtual memory.", "Yes, the model weights should be stored in Flash memory, and the intermediate tensors allocated in SRAM.", "No, because the total size (250KB) exceeds the available SRAM (128KB).", "No, because intermediate tensors must be stored in Flash to prevent data loss on power cycles."], "correct_index": 1}}, {"id": "global-0036", "title": "Structured vs. Unstructured Pruning", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why might 70% unstructured pruning fail to improve latency on a standard mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Unstructured pruning reduces latency by 70% because 70% of multiply-accumulate operations are skipped.", "NPUs require exactly 50% sparsity to trigger bypass logic for multiply-accumulate (MAC) operations.", "Standard NPUs lack specialized hardware to exploit irregular sparsity patterns, making unstructured sparse operations inefficient.", "Unstructured pruning increases latency by exactly 30% due to the O(N) indexing overhead of sparse tensors."], "correct_index": 2}}, {"id": "global-0037", "title": "Minimizing Radio Usage", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which architectural approach will typically result in the lowest overall power consumption?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local TinyML detection with event-based transmission (0.14 Wh/day).", "Continuous cloud streaming detection (4.80 Wh/day).", "Local TinyML detection without transmission (4.80 Wh/day).", "Event-based transmission bypassing MCU (0.12 Wh/day)."], "correct_index": 0}}, {"id": "global-0038", "title": "Tensor Arena Allocation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does TFLM require a pre-allocated Tensor Arena instead of dynamic allocation during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Dynamic allocation requires a constant connection to a cloud-based memory manager.", "The Tensor Arena allows the model to compress weights at runtime to save Flash memory.", "Pre-allocation avoids memory fragmentation and ensures deterministic memory usage in resource-constrained, bare-metal 
environments.", "`malloc()` is only supported on 64-bit architectures, while most edge devices are 32-bit."], "correct_index": 2}}, {"id": "global-0039", "title": "The Dynamic DAG Scheduling Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you reduce the 9-second overhead of sequential LLM planning steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0040", "title": "The Compound System Observability Stack", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design observability to detect and diagnose both performance and quality degradation in this 7-stage AI pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0041", "title": "The Agentic Memory Architecture", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design storage tiering, retrieval, and context-window management for this 500K-token coding-agent memory?", "chain_ids": ["global-chain-auto-secondary-017-54"], "chain_positions": {"global-chain-auto-secondary-017-54": 1}, "chain_tiers": {"global-chain-auto-secondary-017-54": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0042", "title": "The Multi-Tenant Vector Isolation Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which RAG indexing architecture should you choose for 500 tenants, and how would you balance cost, latency, isolation, and operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0043", "title": "The Experiment Reproducibility Crisis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you systematically debug why identical-code retraining drops the model from 94% to 91% accuracy?", "chain_ids": ["global-chain-auto-secondary-017-23"], "chain_positions": {"global-chain-auto-secondary-017-23": 1}, "chain_tiers": {"global-chain-auto-secondary-017-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0044", "title": "The 10,000-Experiment Infrastructure Challenge", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design experiment tracking to scale from 10,000 to 100,000 experiments per month without training or query bottlenecks?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0045", "title": "The Full-Stack Constraint Propagation Workflow", "topic": "extreme-quantization", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the ML development workflow that propagates these constraints backward through the entire development lifecycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0046", "title": "The ML CI/CD Pipeline Design", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why is microservice-style CI/CD incomplete for ML, and what ML-native CI/CD pipeline would you design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0047", "title": "The 3x Rule of Backpropagation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the backward pass of linear layers cost about 2x the forward pass, making training about 3x forward compute?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 0}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0048", "title": "Skip Connections as Gradient Highways", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "From a gradient flow perspective, why does y = f(x) + x enable 100+ layer networks when non-residual networks struggle beyond 20 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0049", "title": "The Critical Batch Size", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Under what condition does doubling batch size halve training time, and when does that linear speedup break down?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 0}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0050", "title": "The Optimizer Memory Tradeoff at Scale", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "When would you consider SGD with momentum for a 175B LLM, and when is Adam's memory overhead justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"global-0051", "title": "MLPerf Execution Scenarios", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does MLPerf Inference need four separate scenarios, and what system property does each stress?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0052", "title": "The Disaggregated Evaluation Pipeline", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many samples per subgroup are needed to detect a 5-point accuracy gap with 80% power, and how large should the eval set be?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0053", "title": "The DRO Training Cost Multiplier", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much more expensive is fairness-aware training with Group DRO, and is there a cheaper alternative that achieves 80% of the benefit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0054", "title": "MoE Decoding Latency Spike at High Batch Size", "topic": "mixture-of-experts", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the 8x7B top-2 MoE slowing from 15ms/token at batch 1 to 45ms/token at batch 128?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0055", "title": "Continuous Roofline Profiling for Multimodal", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you design this profiling system to decompose the performance limiters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0056", "title": "E2E Edge Benchmarking Under Thermal Limits", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build an end-to-end benchmark and profiler to isolate why P99 latency hits 2.8s after 30s on the 5W edge SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0057", "title": "The CPU Overhead Anomaly", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 7B LLM latency spike with mixed sequence lengths under CUDA graphs, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0058", "title": "The Memory-Bound Custom MLP", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the bottleneck in the CustomGLU MLP with 96% HBM bandwidth utilization, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0059", "title": "The Phantom Memory Pool", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does CUDA graph capture OOM after eager warmup when the same model fits in eager mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0060", "title": "Tiered KV-Cache Page Size Tradeoffs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose and manage KV-cache block sizes to minimize HBM fragmentation while keeping PCIe Gen5 prefetches efficient?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 1}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0061", "title": "LLM Serving Arithmetic Intensity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you scale the 30B FP16 LLM on one 80GB A100 from 33 tokens/sec to 500 tokens/sec without hitting KV-cache OOMs?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 2}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0062", "title": "Mitigating KV Cache Fragmentation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign the static KV cache so the 13B model on a 40GB A100 can serve far more than 8 concurrent requests?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 0}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0063", "title": "Optimizer State NVMe Offloading", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What causes the 60-second optimizer step for the 70B fine-tune, and how would you reduce it?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0064", "title": "SRAM Layer Fusion for Edge CNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you eliminate the DRAM bottleneck between Conv 3x3, ReLU, and Conv 1x1 on the 4MB-SRAM edge accelerator?", "chain_ids": ["global-chain-auto-secondary-014-15"], "chain_positions": {"global-chain-auto-secondary-014-15": 0}, "chain_tiers": {"global-chain-auto-secondary-014-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0065", "title": "Diagnosing Flat Ring AllReduce Latency", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose the 106 ms synchronization time, and what is the true root cause?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0066", "title": "Root-Causing AllToAll Tail Latency", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 15-20 ms AllToAll tail latencies, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0067", "title": "Petabyte-Scale Near-Duplicate Detection Tradeoffs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the MinHash+LSH pipeline to deduplicate 10B documents without an O(N^2) shuffle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0068", "title": "Multi-PB Distributed Streaming Dataloader", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the streaming data loader to globally shuffle 20 PB from S3 at 819 GB/s without local NVMe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0069", "title": "Optimizing Checkpoint Frequency via Young-Daly", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpoint interval and architecture would maximize goodput with a 6-hour MTBF and 4-minute synchronous checkpoint?", "chain_ids": ["global-chain-auto-secondary-016-07"], "chain_positions": {"global-chain-auto-secondary-016-07": 1}, "chain_tiers": {"global-chain-auto-secondary-016-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0070", 
"title": "Asynchronous Multi-Tier Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you reduce the 2.4 TB checkpoint pause from 120 seconds to under 5 seconds while preserving node-failure tolerance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0071", "title": "Fast Recovery via In-Memory Redundancy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design recovery to restart in under 1 minute after a single-node fault instead of reloading 1.5 TB from storage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0072", "title": "Mitigating Network Stragglers in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you detect and mitigate silent 400 Gbps-to-10 Gbps link degradation during a 10,000-GPU run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0073", "title": "Architecting SDC Detection in Large Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect an SDC detection mechanism that prevents saving corrupted checkpoints without doubling compute costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0074", "title": "Diagnosing InfiniBand Adaptive Routing Loop", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the 8 ms All-to-All tail latency with zero drops, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0075", "title": "3D Torus Deadlock and Credit Starvation", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 3D Torus network to freeze at 0 GB/s after a transient link flap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0076", "title": "RoCEv2 PFC Storm Propagation", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can one NIC that stops draining packets freeze the entire 1024-node RoCEv2 subnet within 2 ms?", "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0077", "title": "ECMP Hash Collisions with Elephant Flows", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are two 400G uplinks saturated while two are idle during the 64-GPU AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0078", "title": "Long-Haul InfiniBand Buffer Credit Exhaustion", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does RDMA over the 20 km 400 Gbps dark-fiber link cap at about 16 Gbps despite zero optical errors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0079", "title": "GPUDirect RDMA NUMA Crossing Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is capping inter-node AllReduce at 120 GB/s on the 8-GPU, 4-NIC dual-socket node?", "chain_ids": ["global-chain-auto-secondary-016-14"], "chain_positions": {"global-chain-auto-secondary-016-14": 1}, "chain_tiers": {"global-chain-auto-secondary-016-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0080", "title": "800G PAM4 Signal Degradation from Dirty Fiber", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely cause of 800G OSFP links staying up but spiking to 250 ms latency and 10 Gbps throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0081", "title": "DCQCN Congestion Control Oscillation", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 400G-to-5G sawtooth bandwidth oscillation under All-to-All load with zero drops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0082", "title": "Topology-Unaware Scheduling on Oversubscribed Fabric", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 16-node job get only 66 GB/s AllReduce while contiguous-rack jobs get 200 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0083", "title": "Dragonfly Topology Congestion without Valiant Routing", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Group 1 to Group 5 traffic in the Dragonfly collapse to 40 Gbps per node despite healthy global links?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0084", "title": "In-Network Computing (SHARP) Resource Exhaustion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 21st concurrent job lose SHARP-like 5μs AllReduce latency while the first 20 remain fast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0085", "title": "PCIe ACS Blocking GPUDirect Peer-to-Peer DMA", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is GPUDirect RDMA 12 \\u03bcs and 30 GB/s even though the GPU and NIC share the same PCIe switch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0086", "title": "Multi-tenant LoRA Serving Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design the inference architecture to minimize GPU cost while guaranteeing SLA across all tenants?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0087", "title": "Strict SLA Fraud Detection Ensembling", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "If traffic unexpectedly spikes to 80,000 QPS, how do you design the serving system to guarantee the 50ms SLA without dropping transactions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0088", "title": "Tradeoffs in High-Throughput Embedding Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor 256-dimensional embedding drift at 100,000 QPS without adding 40 ms latency or OOMing 2 GB sidecars?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0089", "title": "Safe Deployment for Latency-Sensitive Generation Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you continue the 10% canary or switch to shadow deployment for V2 under the 200 ms SLA, and why?", "chain_ids": ["global-chain-auto-secondary-011-33"], "chain_positions": {"global-chain-auto-secondary-011-33": 1}, "chain_tiers": {"global-chain-auto-secondary-011-33": "secondary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0090", "title": "Real-Time Global Toxicity Moderation Fairness Architecture", "topic": "responsible-ai", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you measure and mitigate dialect fairness issues at 100,000 QPS while keeping p99 latency under 50 ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0091", "title": "EU AI Act Compliance Pipeline Storage Design", "topic": "responsible-ai", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the 10-year audit-log pipeline to satisfy EU AI Act evidence needs without violating GDPR minimization?", "chain_ids": ["global-chain-auto-secondary-017-66"], "chain_positions": {"global-chain-auto-secondary-017-66": 1}, "chain_tiers": {"global-chain-auto-secondary-017-66": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0092", "title": "Disaggregated Evaluation Cluster for Foundation Models", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compute weekly fairness metrics over 10B images and 500 cohorts within the $25,000 evaluation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0093", "title": "Federated Bias Mitigation in Mobile Health Diagnostics", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you preserve equal opportunity across 10 skin-tone groups without demographic data leaving devices or exceeding 200 TB/day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0094", "title": "Low-Latency Multi-Objective Recommendation Debiasing System", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you enforce creator exposure parity in the top-10 recommendations without breaking the 60 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0095", "title": "FP8 Gradient Casting Overflow Crisis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused this catastrophic failure?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0096", "title": "Cross-Lingual PTQ Activation Outliers", "topic": "quantization-fundamentals", "competency_area": 
"precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did quantization break specific languages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0097", "title": "Asymmetric Quantization Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the smaller INT8 model slower than FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0098", "title": "The QAT Shadow Weight OOM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing this massive memory inflation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0099", "title": "Depthwise Conv Per-Tensor Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the Depthwise Convolution layers exhibit massive errors under per-tensor INT8 quantization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0100", "title": "Long-Context KV Cache RoPE Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What interaction destroyed the long-context retrieval?", "chain_ids": ["global-chain-auto-secondary-017-30"], "chain_positions": {"global-chain-auto-secondary-017-30": 0}, "chain_tiers": {"global-chain-auto-secondary-017-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0101", "title": "The Unfolded BatchNorm Latency Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is there zero latency improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0102", "title": "Serving 70B LLMs on Single Node", "topic": "extreme-quantization", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you architect the deployment of a 70B parameter LLM to meet 96GB memory constraints while maximizing generation throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0103", "title": "Hopper FP8 Format Mismatch", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L2", "zone": "specification", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "How should FP8 formats and scaling be chosen to preserve accuracy while meeting the H100 throughput target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0104", "title": "Long-Context VRAM Exhaustion", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing OOM at only 20 concurrent 64k-context requests, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0105", "title": "Recovering Edge Vision Accuracy", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you recover the drone detector's mAP while keeping the 50M model fully INT8 under the 5W NPU limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0106", "title": "Systematic Activation Outliers", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantize the 13B LLM to W8A8 without losing accuracy to 100x activation-channel outliers?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 2}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0107", "title": "RecSys Network Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the DLRM All-to-All bottleneck and meet the 30ms P99 latency budget without hurting accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0108", "title": "Static Calibration Distribution Shift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did static W8A8 PTQ lose 15 BLEU points on 500-token legal documents, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0109", "title": "Mixed-Precision Fleet Integration", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you deploy one 50B artifact while using FP8 on modern hardware and INT8 on older hardware to hit 50,000 QPS?", "chain_ids": ["global-chain-auto-secondary-017-44"], "chain_positions": {"global-chain-auto-secondary-017-44": 0}, "chain_tiers": 
{"global-chain-auto-secondary-017-44": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0110", "title": "LLM Decode Batch Size Limits", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using the roofline model, why do latency spike and throughput plateau when Llama-70B batch size reaches 64?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 4}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0111", "title": "The Quantization Roofline Paradox", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did INT8 quantization yield only an 18% latency gain despite 4x higher INT8 TOPS and half the memory footprint?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 2}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0112", "title": "Recomputation and Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 run 4x faster at 32k context even though it performs about 15% more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0113", "title": "MoE vs Dense Arithmetic Intensity", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the MoE decode throughput 30% lower despite having fewer active parameters per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0114", "title": "DLRM Heterogeneous Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 600GB CTR model at 30,000 QPS, should you use high-bandwidth GPUs, high-capacity CPUs, or a hybrid architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0115", "title": "Kernel Fusion Roofline Shift", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantitatively counter the claim that fusing 1%-FLOP element-wise ops cannot improve inference time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "global-0116", "title": "Diffusion Resolution Scaling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does doubling diffusion resolution from 512 to 1024 increase denoising latency by 9x instead of 4.5x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0117", "title": "Massive Context KV Cache Paging", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you page the 1M-token KV cache across HBM, host DRAM, and NVMe while sustaining 10 tokens/sec decoding speed?", "chain_ids": ["global-chain-auto-secondary-017-30"], "chain_positions": {"global-chain-auto-secondary-017-30": 1}, "chain_tiers": {"global-chain-auto-secondary-017-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0118", "title": "Long-Running Inference OOM Death", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 12-hour CUDA OOMs despite 45GB theoretical use, and how would you redesign KV memory management?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 2}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0119", "title": "SRAM Tiling for Custom Attention", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you structure registers, shared memory, and HBM access for sliding-window attention at seq_len 131,072?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0120", "title": "MoE Weight Fetching Bottleneck", "topic": "extreme-quantization", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 8x7B MoE stuck at 8 tokens/sec at batch size 1, and how would you reach the 40 tokens/sec SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0121", "title": "High-Density LoRA Adapter Swapping", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect a multi-tier memory hierarchy to serve 10,000 adapters without violating the 200ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0122", "title": "Multi-Tier Embedding Lookups", "topic": "data-pipeline-engineering", "competency_area": 
"data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect 5TB DLRM embedding lookups across HBM, host RAM, and NVMe for 10,000 QPS under 50ms P99?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0123", "title": "3D CNN Activation Checkpointing", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hybrid recompute/offload strategy would fit 180GB of 3D UNet activations on an 80GB GPU at 1.5 iter/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0124", "title": "Edge LLM Unified Memory Sizing", "topic": "extreme-quantization", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you lay out and quantize the 8B model and 8k KV cache to hit 20 tokens/sec without SSD swapping on 16GB Apple Silicon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0125", "title": "The Operator Fusion Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did fusing the four element-wise ops make the 100M-element FP32 kernel take 150ms instead of the expected 25ms?", "chain_ids": ["global-chain-auto-secondary-014-15"], "chain_positions": {"global-chain-auto-secondary-014-15": 1}, "chain_tiers": {"global-chain-auto-secondary-014-15": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0126", "title": "Depthwise Separable Disappointment", "topic": "roofline-analysis", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing 3x3 convolutions with depthwise separable convolutions slow the edge NPU despite cutting MACs from 2.5G to 0.3G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0127", "title": "The H100 Speedup Discrepancy", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the H100 upgrade speed up 70B prefill by 3.1x but decode by only 1.6x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0128", "title": "The FlashAttention Paradox", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 slow the ViT attention from 0.8ms to 1.2ms at sequence length 196?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "global-0129", "title": "The Power-Clock Anomaly", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does underclocking cores by 40% barely affect Whisper batch-1 throughput while cutting power by 35%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 2}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0130", "title": "Mobile NPU Utilization Collapse", "topic": "roofline-analysis", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 4K super-resolution model run at 333ms instead of the 10ms implied by the 10 TOPS NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0131", "title": "The INT4 Quantization Plateau", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT4 weight-only quantization plateau at 12ms/token after INT8 halved Llama-2 13B decode latency?", "chain_ids": ["global-chain-auto-secondary-016-13"], "chain_positions": {"global-chain-auto-secondary-016-13": 1}, "chain_tiers": {"global-chain-auto-secondary-016-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0132", "title": "The KV Cache Roofline Collapse", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does increasing the batch size from 16 to 128 drop attention kernel throughput by 2.5x on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0133", "title": "Continuous Batching Decode Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 4k-token prompts cause P99 TPOT spikes to 200ms+, and how would you keep TPOT under 50ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0134", "title": "Speculative Decoding Economics", "topic": "speculative-decoding", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding with a 7B draft model worsen TPOT to 32ms, and how would you meet the 25ms SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0135", "title": "Paged Attention Internal Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "Why does PagedAttention with 512-token blocks OOM at batch size 128 for 80-token translation requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0136", "title": "Disaggregated Prefill Network Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does disaggregated prefill/decode raise client TTFT to about 400ms over 100 Gbps Ethernet, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0137", "title": "Prompt Caching PCIe Bottleneck", "topic": "compound-ai-systems", "competency_area": "architecture", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does loading the cached 50k-token KV prefix over PCIe Gen4 leave TTFT at 700ms, and how would you meet 500ms?", "chain_ids": ["global-chain-auto-secondary-017-54"], "chain_positions": {"global-chain-auto-secondary-017-54": 0}, "chain_tiers": {"global-chain-auto-secondary-017-54": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0138", "title": "Memory Bandwidth Bound Decode", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 20ms TPOT impossible for a 70B FP16 model on 2x GPUs, and what architecture could meet it?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 3}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0139", "title": "Continuous Batching Queueing Death Spiral", "topic": "batching-strategies", "competency_area": "latency", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does P99 latency climb past 5 minutes at 2 QPS even though TPOT is flat at 30ms, and how would you fix capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0140", "title": "Long Context Attention FLOPs Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 avoid OOM but still give over 6 minutes TTFT for a 500k-token prefill, and what must change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0141", "title": "The PagedAttention OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 512-token PagedAttention 
block sizing OOM at 64 short translation requests on an 80GB A100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0142", "title": "The Speculative Decoding Paradox", "topic": "speculative-decoding", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding improve TPOT at 1 QPS but collapse throughput at 50 QPS, and how should you handle it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0143", "title": "Chunked Prefill Latency Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 4096-token prefill chunks make active Mixtral decode TPOT spike from 30ms to 116ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0144", "title": "The Poisoned Prefix Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the RadixAttention prefix cache hit rate 0% despite reused system prompts, and how would you restore it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0145", "title": "Continuous Batching Generation Deadlock", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the vLLM engine to stall after 10 seconds, and how would you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0146", "title": "The Multi-LoRA Throughput Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does throughput collapse when a batch contains 32 different LoRA adapters despite 100% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0147", "title": "Ring Attention Compute Starvation", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 1M-token Ring Attention prefill stall with 1024-token chunks, and what chunk-size change would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0148", "title": "The QSPI Thrashing Spikes", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does increasing the on-device fine-tuning batch size from 1 to 2 make epoch time jump from 
400ms to 3.2s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0149", "title": "The Unaligned Power Drain", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the CMSIS-NN keyword spotting model use more energy per inference even though latency falls from 45ms to 12ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0151", "title": "The Fragmented FL Round", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 30KB gradient buffer allocation fail in round 6 despite 80KB of free heap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0152", "title": "The Strided Cache Miss", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the custom 2D convolution taking 80ms instead of the 15ms suggested by its MAC count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0153", "title": "The Wear-Out Checkpoint", "topic": "federated-learning", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the edge devices to hard fault during weight update serialization after 45 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0154", "title": "The Bus Contention Stall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does inference latency rise from 20ms to 28ms when DMA writes Pong while the CPU processes Ping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0155", "title": "The I-Cache Thrash Loop", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the unrolled CMSIS-NN kernel halve battery life even though it still meets the 10ms deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0156", "title": "XIP Flash Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FL sparsity update make the 200KB keyword spotting model slower and higher power despite 10% fewer FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "global-0157", "title": "PSRAM Strided Access Overhead", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the ESP32-S3 backward pass about 10x slower and thermally worse than the forward pass with the same MAC count?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0158", "title": "DMA and AXI SRAM Bus Contention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the few-microsecond inference jitter and slight power increase when ADC DMA and the TFLite Micro model share AXI SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0160", "title": "Flash Block Erase Write Amplification", "topic": "federated-learning", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the nightly 64KB Flash weight-delta write grow from 120ms to 1.8s after 6 months?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0161", "title": "BLE DMA External RAM Wakeups", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does BLE transmission of the 30KB FL gradient buffer draw 8mA instead of letting the CPU sleep at 2mA radio current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0162", "title": "SRAM Overlay Thrashing in Backprop", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 3-layer FL backward pass take 2.4s instead of the 250ms predicted by FLOP counts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0163", "title": "The Overclocking Energy Trap", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 15% GPU frequency boost reduce training time but increase total energy by nearly 40%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 1}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0164", "title": "The Quantization Cost Paradox", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does switching the 70B model from FP16 on 4 GPUs to W8A8 INT8 on 2 GPUs increase cost 
per 1K tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0165", "title": "The Federated Battery Drain", "topic": "federated-learning", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does pruning the FL payload from 50MB to 5MB increase battery drain when sync frequency rises from every 100 steps to every 10?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0166", "title": "The Thermal Throttling Ring", "topic": "thermal-management", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 64-node MoE training job lose 25% throughput every day between 2 PM and 6 PM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0167", "title": "The Carbon-Aware Checkpoint Penalty", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did pausing the 256-GPU job during high-carbon periods increase total carbon emissions by 12%?", "chain_ids": ["global-chain-auto-secondary-017-28"], "chain_positions": {"global-chain-auto-secondary-017-28": 0}, "chain_tiers": {"global-chain-auto-secondary-017-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0168", "title": "The PUE Illusion", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the datacenter PUE worsen from 1.15 to 1.30 when switching from dense LLM training to memory-bound MoE?", "chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": {"global-chain-auto-secondary-017-57": 0}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0169", "title": "The Power Capping Headroom Paradox", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 200W power cap cause p99 latency to jump above 500ms even though average utilization is 45% and power is 150W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0173", "title": "The Data Residency Constraint", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Under GDPR, what is 'data residency' and why does it constrain where you can place your training infrastructure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0174", "title": "The Grid 
Emissions Gap", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If you run the same 1,000 GPU-hour training job in each country, how does the carbon footprint differ?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 0}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0175", "title": "The Edge Cache Inference Pattern", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What serving architecture exploits this redundancy to minimize inference costs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0176", "title": "The GPU Cluster Power Wall", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total power draw, and why is securing this amount of power a major infrastructure challenge?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 0}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0178", "title": "The Gradient Staleness Problem", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is this phenomenon called, and why is it harmful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0179", "title": "The Anycast Routing Pattern", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "With anycast routing, how does the network decide which datacenter handles the request?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 0}, "chain_tiers": {"global-chain-auto-secondary-011-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0180", "title": "The Gradient Sync Penalty", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a full FP32 gradient AllReduce for the 10B-parameter model take over the 100 Gbps, 60ms RTT cross-DC link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0181", "title": "The CO2 Per Training Run", "topic": "sustainability-carbon-accounting", 
"competency_area": "power", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total CO2 emissions, in metric tons, for the 256-A100 training run with PUE 1.3 and 400 gCO2/kWh grid intensity?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 1}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0183", "title": "The Electricity Arbitrage", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What annual savings do you get by scheduling the 1,000-GPU batch workload during the 8 off-peak hours instead of peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0184", "title": "The Gradient Compression Dividend", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After Top-1% sparsification with 4-byte values and 4-byte indices, what are the new AllReduce time and effective compression ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0185", "title": "The Model Sync Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long does a full model sync take, and what is the monthly egress cost at $0.08/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0186", "title": "The Async Pipeline Throughput Limit", "topic": "pipeline-parallelism", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 4 microbatches in flight, what are the pipeline bubble and steady-state throughput across the two 50ms stages and 40ms cross-DC link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0187", "title": "The Right-to-Be-Forgotten Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If retraining costs $2/GPU-hour, what does each deletion request cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0188", "title": "The Follow-the-Sun Savings", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much CO2 does the 48-hour job emit running entirely in Region C versus a 'follow-the-renewables' schedule, and what reduction does that imply?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0189", "title": "The Time-to-First-Token Across Oceans", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What TTFT will the London and Tokyo users see, do they meet the 200 ms SLA, and what would fix Tokyo if it misses?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 1}, "chain_tiers": {"global-chain-auto-secondary-011-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0191", "title": "The Latency-Carbon-Cost Triangle", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which European region should you deploy in given the 500 ms TTFT SLA, 400 ms prefill, latency, cost, and carbon tradeoffs?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 3}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0192", "title": "The Data Locality Penalty", "topic": "data-efficiency-selection", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach—separate country models, federated learning, or synthetic data—best balances compliance, quality, and patient safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0193", "title": "The Cable Cut Failover", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the impact of the cable cut on Asian inference traffic, and which failover strategy would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0194", "title": "The Green AI Paradox", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does moving the 256-GPU training job from Virginia to Quebec reduce net carbon after data transfer and latency costs?", "chain_ids": ["global-chain-auto-secondary-017-28"], "chain_positions": {"global-chain-auto-secondary-017-28": 1}, "chain_tiers": {"global-chain-auto-secondary-017-28": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0195", "title": "The Inference Placement Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the translation service use one 
64-H100 US-East cluster or eight 8-H100 regional clusters to meet the 300 ms P95 latency target?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 2}, "chain_tiers": {"global-chain-auto-secondary-011-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0196", "title": "The GPAI Threshold Debate", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does the 70B model trained on 2,048 GPUs for 90 days exceed the EU AI Act 10^25 FLOP threshold, and what does that imply?", "chain_ids": ["global-chain-auto-secondary-017-66"], "chain_positions": {"global-chain-auto-secondary-017-66": 0}, "chain_tiers": {"global-chain-auto-secondary-017-66": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0197", "title": "The Renewable Intermittency Trap", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule inference and batch training on solar and grid power to minimize cost while keeping inference available 24/7?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 3}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0198", "title": "The Regional Model Routing Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should all users get the same model, or should Asian users get the smaller, faster model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0199", "title": "The Multi-Region Checkpoint Strategy", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate local hourly, local hourly + remote 6h, and remote hourly checkpoint strategies for expected loss and overhead tradeoffs.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0200", "title": "Matrix Multiply Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of the 4096×4096 FP16 matmul, and is it compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0201", "title": "KV Cache Memory for 7B Model Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "How much KV cache memory does each 4096-token request need, and how many concurrent requests fit alongside the model weights on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0202", "title": "INT8 Quantization Serving Throughput Gain", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After quantizing the memory-bound 13B FP16 decode model to INT8, what throughput should you expect?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 1}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0203", "title": "Energy Cost of a Training Run", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total electricity cost for the 64-GPU, 72-hour training run including PUE?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 1}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0204", "title": "Ring AllReduce Communication Time", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should the 2 GB gradient ring AllReduce take across 8 A100s over NVLink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0205", "title": "PCIe vs NVLink Model Shard Transfer", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 10 GB tensor shard transfer take over PCIe Gen4 x16 versus NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0206", "title": "Buy vs Rent GPU Break-Even", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 80% utilization, after how many months does buying an A100-80GB break even versus $2.50/GPU-hour cloud rental?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 1}, "chain_tiers": {"global-chain-auto-secondary-017-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0207", "title": "Self-Attention Arithmetic Intensity During Decode", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What is the arithmetic intensity of single-token attention with a 2048-token FP16 KV cache, and where does it fall on the A100 roofline?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 2}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0208", "title": "Activation Checkpointing Memory Savings", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much activation memory is needed with no checkpointing versus checkpointing every 4 layers?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 0}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0209", "title": "Continuous Batching Throughput vs Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What system throughput gain and per-request latency change should continuous batching from 1 to 32 requests produce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0210", "title": "Kernel Fusion Memory Bandwidth Savings", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic is saved by fusing the LayerNorm, GELU, and Dropout kernels for the (32, 2048, 4096) FP16 tensor?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 1}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0211", "title": "GPU Rack Power Density Limit", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 8-GPU servers fit in a 20 kW rack, and how many GPUs per rack is that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0212", "title": "Roofline Classification of Elementwise Operations", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of ReLU on 100M FP16 elements, and what maximum TFLOPS can it achieve on an A100?", "chain_ids": ["global-chain-auto-secondary-006-30"], "chain_positions": {"global-chain-auto-secondary-006-30": 1}, "chain_tiers": {"global-chain-auto-secondary-006-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0213", "title": "4-Bit Quantization for Consumer GPU Deployment", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the FP16, INT8, and INT4 memory footprints for the 70B model on a 24 GB GPU, and can it fit at INT4?", "chain_ids": ["global-chain-auto-secondary-016-13"], "chain_positions": {"global-chain-auto-secondary-016-13": 0}, "chain_tiers": {"global-chain-auto-secondary-016-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0214", "title": "Cost Per 1M Tokens Served", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the marginal compute cost per 1M output tokens for the serving 2000 tokens/s at $3.50/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0215", "title": "Tensor Parallelism Communication Volume", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the AllReduce volume per layer per forward pass under 4-way tensor parallelism, assuming batch=1 and seq_len=2048?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 1}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0216", "title": "Optimal Checkpointing Interval", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted compute for the cluster, and what overhead does it imply?", "chain_ids": ["global-chain-auto-secondary-016-07"], "chain_positions": {"global-chain-auto-secondary-016-07": 0}, "chain_tiers": {"global-chain-auto-secondary-016-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0217", "title": "Mixed-Precision Training Memory Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory is needed for mixed-precision Adam model state for the 3B model, and does it fit on one GPU?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 2}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0218", "title": "Cross-Rack AllReduce Latency Impact", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "How much slower is a 4 GB AllReduce across two racks over NDR versus within one NVSwitch rack?", "chain_ids": ["global-chain-auto-secondary-016-14"], "chain_positions": {"global-chain-auto-secondary-016-14": 0}, "chain_tiers": {"global-chain-auto-secondary-016-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0220", "title": "Voltage Scaling and Dynamic Power", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the new dynamic power after reducing voltage and frequency by 15%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 0}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0221", "title": "H100 vs A100 Roofline Comparison", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the H100 and A100 ridge points, and how much speedup does H100 give at 200 FLOP/byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0223", "title": "torch.compile Warm-Up vs Steady-State", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For a 10,000-step run, when does torch.compile break even and how much total time does it save?", "chain_ids": ["global-chain-auto-secondary-017-62"], "chain_positions": {"global-chain-auto-secondary-017-62": 0}, "chain_tiers": {"global-chain-auto-secondary-017-62": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0224", "title": "Little's Law for GPU Inference Server Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many GPUs are needed to handle 500 requests/s at 200 ms latency with 8 concurrent requests per GPU without queuing?", "chain_ids": ["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 0}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0225", "title": "Diagnosing Prefill-Decode Interference", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely causes the p99 token latency spikes when long prefills arrive during short decodes, and how would you mitigate it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0226", "title": "GPU Shows 40% Utilization Despite 
Full Batch", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might GPU utilization be only 40% despite a fully packed batch, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0227", "title": "OOM at Step 500 but Not Step 1", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What most likely causes the gradual memory climb and OOM at step 500, and how would you debug it?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 3}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0228", "title": "When INT8 Quantization Hurts More Than It Helps", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can INT8 quantization preserve perplexity yet drop RAG factual accuracy from 89% to 71%, and what should you do?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0229", "title": "Pipeline Parallelism Bubble Overhead Analysis", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the bubble overhead and total time for the 4-stage/16-microbatch versus 8-stage/32-microbatch pipelines?", "chain_ids": ["global-chain-auto-secondary-017-64"], "chain_positions": {"global-chain-auto-secondary-017-64": 1}, "chain_tiers": {"global-chain-auto-secondary-017-64": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0230", "title": "NaN Gradients Appearing After Learning Rate Warmup", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What likely causes NaN gradients right after LR warmup in FP16 mixed-precision training, and how would you fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0231", "title": "ECMP Hash Collision Causing Training Slowdown", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What explains deterministic AllReduce slowdowns for the same node pairs when average network use is low but some links are saturated?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0232", "title": "Spot Instance Strategy for Fault-Tolerant Training", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L4", "zone": "evaluation", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze whether spot instances are cost-effective for a 100-hour training run?", "chain_ids": ["global-chain-auto-secondary-017-46"], "chain_positions": {"global-chain-auto-secondary-017-46": 0}, "chain_tiers": {"global-chain-auto-secondary-017-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0233", "title": "Carbon-Aware Scheduling Tradeoff", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the carbon tradeoff of running during Region A's daytime vs Region B anytime, considering a 15% compute overhead from cross-region data transfer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0234", "title": "KV Cache Eviction Under Memory Pressure", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should you choose a KV cache eviction policy under memory pressure for 50 active inference requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0235", "title": "Why FlashAttention is Faster Despite More FLOPs", "topic": "flash-attention", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can FlashAttention run faster than standard attention despite doing extra recomputation FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0236", "title": "Diagnosing Stragglers in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the intermittent stragglers across random GPUs in synchronous training?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0237", "title": "torch.compile Recompilation Storm", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes this compilation storm and how do you fix it?", "chain_ids": ["global-chain-auto-secondary-017-62"], "chain_positions": {"global-chain-auto-secondary-017-62": 1}, "chain_tiers": {"global-chain-auto-secondary-017-62": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0238", "title": "Tail Latency Sources in Multi-Model Serving", "topic": "tail-latency", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do tail latencies compound super-linearly in this sequential pipeline?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0239", "title": "Optimal Power Cap for Training Cost Minimization", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the cost-optimal power cap?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 2}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0240", "title": "Optimizing a Memory-Bound Training Bottleneck", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an optimization strategy to reduce this overhead by at least 50%?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 2}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0241", "title": "Maximizing Concurrent Users on Fixed GPU Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design optimizations to maximize concurrent users within the same hardware and SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0242", "title": "Choosing 3D Parallelism Configuration", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the optimal (TP, PP, DP) configuration?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 2}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0243", "title": "Optimizing Time-to-First-Token for Interactive Chat", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an optimization plan targeting the prefill phase specifically?", "chain_ids": ["global-chain-auto-secondary-010-07"], "chain_positions": {"global-chain-auto-secondary-010-07": 0}, "chain_tiers": {"global-chain-auto-secondary-010-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0244", "title": "Reducing AllReduce Bottleneck in Cross-Rack Training", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "How would you design optimizations to reduce communication overhead below 15%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0245", "title": "Right-Sizing an Inference Fleet for Variable Traffic", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the optimal fleet composition to minimize annual cost while meeting peak demand?", "chain_ids": ["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 1}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0246", "title": "Designing a Mixed-Precision Serving Strategy", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a mixed-precision quantization strategy to meet the 2000 tok/s target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0247", "title": "Optimizing Checkpoint Strategy for 1000-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a checkpointing strategy to reduce waste below 15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0248", "title": "Optimizing a Compiled Inference Graph for Throughput", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a compilation optimization strategy for decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0249", "title": "Maximizing Tokens-per-Watt for Sustainable Inference", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an optimization plan to meet the carbon target without reducing throughput?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 2}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0250", "title": "HBM Bandwidth Ceiling on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the peak HBM2e memory bandwidth of an A100-80GB GPU, and how 
does it compare to the H100's HBM3 bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0251", "title": "Arithmetic Intensity Definition", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of a matrix-vector multiply y = Wx where W is 4096x4096 in FP16?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 0}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0252", "title": "Bytes per Parameter Across Precisions", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total VRAM footprint for a 70B parameter model in each format?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0253", "title": "TDP and Energy Cost of GPU Hours", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much energy (in kWh) does a single A100 consume running at full power for 24 hours, and what does this cost at $0.10/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0254", "title": "NVLink vs PCIe Bandwidth Gap", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the bidirectional bandwidth of NVLink 4.0 vs PCIe Gen5 x16, and why does this gap matter for multi-GPU training?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 0}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0255", "title": "Three Axes of Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What are data, tensor, and pipeline parallelism in distributed LLM training, and what does each partition?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 0}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0256", "title": "The A100 Ridge Point", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does this number physically mean?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0257", "title": "GPU-Hour Cost Decomposition", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the approximate cost components (hardware amortization, electricity, cooling, networking, staff) that make up this price?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 0}, "chain_tiers": {"global-chain-auto-secondary-017-45": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0258", "title": "KV Cache Memory per Token", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much KV cache memory is needed per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0259", "title": "Why Kernel Fusion Matters", "topic": "kernel-fusion", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is kernel fusion, why does it improve GPU performance, and what primary bottleneck does it eliminate?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 0}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0260", "title": "Training Memory Budget Breakdown", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why training a 1B parameter model in mixed-precision (FP16 forward/backward + FP32 Adam optimizer) requires approximately 16 GB of memory, not just the 2 GB for FP16 weights?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 1}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0261", "title": "Memory-Bound vs Compute-Bound Intuition", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is batch-1 autoregressive LLM decoding memory-bandwidth-bound while large-batch training is compute-bound, according to the roofline model?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 1}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0262", "title": "AllReduce Communication Overhead", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L2", "zone": "recall", 
"bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does this approach O(2x gradient size) as N grows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0263", "title": "Why Activation Outliers Break Quantization", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does a single activation outlier destroy INT8 quantization accuracy, and how do techniques like LLM.int8() address this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0264", "title": "InfiniBand vs Ethernet for Training", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What specific properties of InfiniBand reduce AllReduce latency compared to Ethernet?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 1}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0265", "title": "INT8 vs INT4 for Production LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare INT8 vs INT4 quantization for a 70B model serving 256-token responses under a 200ms SLA. Which do you choose and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0266", "title": "Tensor vs Pipeline Parallelism for 70B", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Compare TP=8 (tensor parallel across all 8) vs PP=8 (pipeline parallel, 8 stages). 
Which is better for this configuration and why?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 2}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0267", "title": "Spot vs On-Demand for Long Training Runs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate: spot, on-demand, or a hybrid approach?", "chain_ids": ["global-chain-auto-secondary-017-46"], "chain_positions": {"global-chain-auto-secondary-017-46": 1}, "chain_tiers": {"global-chain-auto-secondary-017-46": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0268", "title": "Static vs Continuous Batching for LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantify the throughput difference for a batch of 32 requests where lengths range from 50 to 1500 tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0269", "title": "A100 vs H100 Performance per Watt", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many PFLOPS can you achieve with A100s versus H100s in this 2MW facility?", "chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": {"global-chain-auto-secondary-017-57": 1}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0270", "title": "CPU Offloading vs Activation Recomputation", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which is faster for fitting activations on one A100: CPU offloading over PCIe or activation recomputation?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 2}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0271", "title": "Tensor Parallelism Within vs Across Nodes", "topic": "interconnect-topology", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantify the communication overhead of each approach?", "chain_ids": ["global-chain-auto-secondary-017-26"], "chain_positions": {"global-chain-auto-secondary-017-26": 1}, "chain_tiers": {"global-chain-auto-secondary-017-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0272", "title": "Prefill Chunking vs Monolithic Prefill", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the latency and throughput tradeoffs between monolithic and chunked prefill?", "chain_ids": ["global-chain-auto-secondary-010-07"], "chain_positions": {"global-chain-auto-secondary-010-07": 1}, "chain_tiers": {"global-chain-auto-secondary-010-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0273", "title": "Eager vs Compiled Execution for Inference", "topic": "kernel-fusion", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate when compilation pays for itself and identify scenarios where it doesn't?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 3}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0274", "title": "GPU vs CPU Data Preprocessing for Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compare doubling CPU cores, GPU preprocessing, and offline preprocessed shards for this training pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0275", "title": "On-Prem vs Cloud GPU Cluster Economics", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what utilization does an on-prem deployment break even compared to cloud renting?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 2}, "chain_tiers": {"global-chain-auto-secondary-017-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0276", "title": "Latency Optimization vs Throughput Optimization", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate which configuration is better for (a) a real-time chatbot and (b) a batch document summarization pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0278", "title": "Batch Size to Reach Compute-Bound Regime", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What minimum batch size pushes the main GEMM operations into the compute-bound regime, and what are the latency implications?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 3}, "chain_tiers": 
{"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0279", "title": "Multi-LoRA Serving Architecture", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving architecture that minimizes GPU cost while meeting P99 < 300ms TTFT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0280", "title": "Resilient 1024-GPU Training System", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a fault-tolerant training system for a 175B model on 1024 GPUs where individual GPU failures occur every 2-4 hours?", "chain_ids": ["global-chain-auto-secondary-017-07"], "chain_positions": {"global-chain-auto-secondary-017-07": 1}, "chain_tiers": {"global-chain-auto-secondary-017-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0281", "title": "Multi-Model Serving Gateway Design", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving gateway for a company running 15 LLMs (7B to 70B) across 48 80GB GPUs?", "chain_ids": ["global-chain-auto-secondary-017-44"], "chain_positions": {"global-chain-auto-secondary-017-44": 1}, "chain_tiers": {"global-chain-auto-secondary-017-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0282", "title": "3D Parallelism Configuration for 175B", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the parallelism configuration (DP, TP, PP, micro-batch size) to train a 175B GPT-style model with global batch size 2048 and sequence length 2048?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 3}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0283", "title": "Trillion-Token Data Pipeline Architecture", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a data pipeline to feed this 1024-GPU training cluster under these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0284", "title": "Training Observability Dashboard Design", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": 
"global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What metrics, alerts, and dashboards do you build?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0285", "title": "Memory Budget for High-Concurrency LLM Serving", "topic": "memory-pressure-management", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the binding constraint that prevents serving 256 concurrent requests, and how would you resolve it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0286", "title": "Network Topology for 2048-GPU Training Cluster", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the optimal switch topology, link speeds, and total switch port count for this cluster?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 3}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0289", "title": "Power and Cooling for a 1000-GPU Cluster", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you specify the cooling technology, power distribution, and UPS configuration?", "chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": {"global-chain-auto-secondary-017-57": 2}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0291", "title": "Speculative Decoding Architecture", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you calculate the expected speedup assuming a 70% draft acceptance rate and specify memory allocation on an 80GB GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0292", "title": "Expert Parallelism for MoE Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you specify the expert partitioning, handle token routing all-to-all communication, and estimate the communication overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0293", "title": "Real-Time Tokenization Pipeline at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you specify the architecture to achieve 10M tokens/sec throughput?", "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0294", "title": "TTFT Latency Spike Diagnosis", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the TTFT latency spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0295", "title": "Sudden Training Throughput Drop", "topic": "interconnect-topology", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is happening to cause this sudden 38% throughput drop?", "chain_ids": ["global-chain-auto-secondary-017-26"], "chain_positions": {"global-chain-auto-secondary-017-26": 0}, "chain_tiers": {"global-chain-auto-secondary-017-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0296", "title": "OOM During Evaluation but Not Training", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the model OOM during evaluation but not during training?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 1}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0297", "title": "Loss Plateau After Learning Rate Warmup", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely cause of the training loss plateau?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0298", "title": "Mysterious 15% Throughput Drop at Noon", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the consistent 15% throughput drop between 11 AM and 3 PM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0299", "title": "Replication vs Erasure Coding for Checkpoint Storage", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate storage overhead, write throughput, and recovery time for 3-way replication vs 10+4 Reed-Solomon erasure coding?", "chain_ids": ["global-chain-auto-secondary-017-07"], "chain_positions": {"global-chain-auto-secondary-017-07": 0}, "chain_tiers": {"global-chain-auto-secondary-017-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0300", 
"title": "Operational Intensity Ridge Point Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is a matrix multiplication layer with 100 FLOPs per byte memory-bound or compute-bound on an A100 80GB?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 2}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0301", "title": "FP16 Model VRAM and Load Time Estimation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a 7B FP16 LLM fit on a 16GB GPU, and what is the minimum time to read its weights once?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0302", "title": "INT8 Quantization Decoding Speed on H100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the memory savings achieved by quantization and the maximum theoretical decoding speed per token, assuming inference is strictly memory-bandwidth bound?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 1}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0303", "title": "Tensor Parallelism GPU Count and NVLink Delay", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you calculate the minimum number of GPUs required for weights, and what is the NVLink transfer time for a 12MB activation tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0304", "title": "Pipeline Bubble Delay Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the pipeline bubble delay added by transferring a 250MB intermediate activation tensor between the two nodes over the network?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0305", "title": "GPU Time Savings for Training Run", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many training hours are saved by using one A100 (312 TFLOPS peak) instead of one V100 (125 TFLOPS peak) for 10^21 FLOPs at 30% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "global-0307", "title": "KV Cache Capacity Calculation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you determine the available memory for KV cache and calculate how many total tokens can be stored concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0308", "title": "Offloaded Weights Latency Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the total time and identify the primary bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0309", "title": "SRAM Constrained INT8 Model Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the largest INT8 keyword-spotting model that can fit in the remaining SRAM of a Cortex-M4 device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0310", "title": "INT4 Token Generation Throughput", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does FP16 fail, and what is the max theoretical token generation throughput using INT4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0311", "title": "Pipeline Parallelism NVLink Transfer Delay", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the per-GPU weight memory footprint and compute the transfer delay if 150MB activations are sent between stages via NVLink.", "chain_ids": ["global-chain-auto-secondary-017-64"], "chain_positions": {"global-chain-auto-secondary-017-64": 0}, "chain_tiers": {"global-chain-auto-secondary-017-64": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0312", "title": "Ring All-Reduce Synchronization Delay", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should a ring all-reduce take to synchronize a 2GB gradient tensor across exactly two nodes over IB NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0313", "title": "Cost Calculation for Continuous Generation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you 
calculate the exact dollar cost incurred to generate exactly 1.5 million tokens at continuous full load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0314", "title": "Datacenter Rack Power Draw Estimation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What facility power does an 8x A100 server draw after adding host power and applying a PUE of 1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0315", "title": "Batch Size and Token Generation Time", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you determine the maximum batch size, and calculate the memory-bound token generation time for that full batch?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 1}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0317", "title": "KV Cache Memory Sizing for 8B Model on 32GB V100", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum context length can an 8B FP16 model support on a single 32 GiB V100 after reserving memory for its KV cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0318", "title": "Sizing Cluster Memory for 65B Parameter Model", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the total memory needed for optimizer states, gradients, and parameters, and determine the minimum number of GPUs required?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 1}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0319", "title": "Latency Comparison: PCIe Gen4 vs NVLink for KV Cache Offload", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the latency difference between these two interconnect pathways for the full 40GB transfer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0335", "title": "Topology-Aware Placement Rule", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should an inference scheduler place 8 model shards to avoid 
topology-induced communication bottlenecks?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 2}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0339", "title": "Data Validation Budget Optimization", "topic": "data-quality-validation", "competency_area": "data", "track": "global", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you reduce validation CPU cost by 80% while still catching schema regressions in 100M daily records?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0358", "title": "Extreme Quantization Acceptance Spec", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What acceptance criteria should gate a 2-bit quantized model for mobile and edge release?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0362", "title": "Cross-Regime Recovery Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design failure recovery for 2 TB cloud checkpoints, 20 MB mobile adapters, and 64 KB TinyML calibration state?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0374", "title": "Specify Overlap Acceptance Criteria", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What acceptance criteria should be specified before claiming communication-computation overlap?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0375", "title": "Cross-Regime Compute Cost Spec", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What inputs are required to estimate cost per successful interaction across cloud, mobile, and edge inference when retries can fail?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 1}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0378", "title": "Incomplete Compute Cost Procurement Decision", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What information is missing before choosing buy, rent, or on-device inference, and what can be bounded from FLOPs and traffic alone?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 2}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0397", "title": "Overlap Window Sizing", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication time remains exposed if 240 ms of all-reduce can overlap with only 180 ms of backward compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0401", "title": "Cluster Rollout Capacity With GPU Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is draining 10% of 60 GPU pods safe at 1,350 rps if utilization must stay below 80%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0421", "title": "Global Observability Signal Gap", "topic": "monitoring-observability", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can cloud metrics look healthy while mobile task success drops after the same model rollout?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0432", "title": "Evaluating Continuous vs Fixed Batching Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether the endpoint can remain stable under an arrival rate of 100 req/s, and calculate the minimum average batch size required.", "chain_ids": ["global-chain-auto-secondary-017-65"], "chain_positions": {"global-chain-auto-secondary-017-65": 0}, "chain_tiers": {"global-chain-auto-secondary-017-65": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0434", "title": "Memory Bandwidth Bottleneck Analysis for Custom Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the arithmetic intensity of a 64x64 block load and calculate the required data reuse to saturate the tensor cores.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0435", "title": "Heterogeneous Routing Algorithm for LLM Multi-Region Deployment", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a load-shedding and routing algorithm that minimizes p99 latency during unpredictable traffic surges without dropping requests.", "chain_ids": ["global-chain-auto-secondary-017-65"], "chain_positions": {"global-chain-auto-secondary-017-65": 1}, "chain_tiers": {"global-chain-auto-secondary-017-65": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"global-0436", "title": "Evaluating Synchronous vs Asynchronous Checkpointing Trade-offs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which checkpointing strategy yields higher goodput if a node fails exactly once every 10 hours.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0437", "title": "Designing 3D Parallelism Topology for Inter-Node Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an optimal placement strategy for Tensor (TP), Pipeline (PP), and Data Parallelism (DP) to prevent the 400 Gbps links from starving the GPUs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0438", "title": "Evaluating Compute-Communication Overlap in MoE Layers", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether dividing the workload into 4 overlapping micro-steps completely hides the AllToAll communication latency behind compute.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0440", "title": "Multi-Tenant LoRA Serving Architecture on MI300X", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you manage memory and request batching to serve all adapters without reloading the base model?", "chain_ids": ["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 2}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0001", "title": "The OTA Cellular Limit", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical size limit you must stay under for a cellular download that doesn't require explicit user opt-in?", "chain_ids": ["mobile-chain-auto-001-10"], "chain_positions": {"mobile-chain-auto-001-10": 0}, "chain_tiers": {"mobile-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 MB", "2 GB", "~200 MB", "Unlimited, as long as the user has a data plan."], "correct_index": 2}}, {"id": "mobile-0004", "title": "The OTA Data Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the data consumption for a single user if you ship the full update?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The update will only transfer the 40 MB difference, which is a manageable size.", "The 10% accuracy gain is worth the user data cost; we should ship the 120 MB update immediately.", "The update will consume 120 MB of the user's data plan, a significant cost we must address.", "The update is about 960 Megabits (Mb), which requires a Wi-Fi connection."], "correct_index": 2}}, {"id": "mobile-0005", "title": "The OTA Update Budget Shock", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What total carrier data cost results from rolling out a 350 MB model update to 5 million users at $2 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$35,000", "$350,000", "$3,500,000", "$1,750,000"], "correct_index": 2}}, {"id": "mobile-0006", "title": "The OTA Bandwidth Bottleneck", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following is the most critical physical constraint to consider for the update delivery mechanism?", "chain_ids": ["mobile-chain-auto-001-05", "mobile-chain-auto-001-06"], "chain_positions": {"mobile-chain-auto-001-05": 0, "mobile-chain-auto-001-06": 0}, "chain_tiers": {"mobile-chain-auto-001-05": "primary", "mobile-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["On-device flash storage capacity", "CPU cycles required for model decompression", "Unreliable, low-bandwidth cellular connectivity", "Power consumption during the download"], "correct_index": 2}}, {"id": "mobile-0008", "title": "The OTA Budget Constraint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Will a 70M-parameter FP16 model fit under a 150 MB OTA update budget?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["280 MB, so it fails the requirement (Assuming FP32).", "70 MB, so it easily meets the requirement (Assuming INT8).", "140 MB, so it meets the requirement.", "17.5 MB, so it easily meets the requirement."], "correct_index": 2}}, {"id": "mobile-0010", "title": "The Conversion Cliff", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong during the seemingly successful PyTorch to CoreML conversion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0011", "title": "The App Store Privacy Rejection", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did Apple reject an on-device Core ML face-age filter for missing a face-data purpose string, and what must change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apple treats face geometry as biometric data regardless of processing location. Fix: (1) add NSFaceIDUsageDescription with a clear purpose string, (2) update the Privacy Nutrition Label to declare biometric data, (3) add a runtime consent dialog before first camera access. Prevention: 35 minutes.", "We don't collect face data — the model runs on-device, so there's no privacy issue. Apple's privacy framework doesn't distinguish between on-device processing and server-side collection for certain sensitive data categories.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance.", "Deploy the update to all devices simultaneously to minimize the total rollout duration and reduce the window of version inconsistency."], "correct_index": 0}}, {"id": "mobile-0012", "title": "The App Size Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a 500 MB FP32 model ship in an app that must stay under the 200 MB cellular download limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0013", "title": "The App Store Model Size Rejection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a 350 MB diffusion model be delivered to avoid the iOS 200 MB cellular download limit warning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0014", "title": "The ML Crash vs Silent Failure", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you detect and diagnose ML failures that don't crash the app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The accuracy degradation is caused by numerical instability in the framework's matrix multiplication kernels, not by the model or data.", "If there are no crashes, the model is working fine.. ML models fail silently — they return valid tensors filled with garbage instead of throwing exceptions.", "ML failures are invisible to standard crash reporting because the model always returns *something* — a valid tensor of zeros, random confidences, or stale results. 
You need ML-specific health monitoring.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements."], "correct_index": 2}}, {"id": "mobile-0015", "title": "The App Store ML Review Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the Core ML model compilation step fail on older ANE hardware, and how must you design your model delivery to handle Apple's heterogeneous hardware matrix?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0016", "title": "The Feature Flag Footgun", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does enabling six ML features at once make a 6 GB RAM device kill apps and the launcher?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Each model is only 20-30 MB, so 6 models = 180 MB. That should easily fit in 6 GB RAM.", "The problem is cumulative ML runtime memory, not model file size. Each model's runtime footprint is 3-5x file size. Six models at ~100 MB runtime each = 600 MB, creating severe memory pressure.", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model.", "Moving to a memory-mapped file loading strategy will reduce peak memory to near-zero since only accessed weights are loaded."], "correct_index": 1}}, {"id": "mobile-0017", "title": "The ML Error That Looks Like a Feature", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an iOS update make portrait mode look more dreamy, and should the camera app ship or fix the change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Rollback capability is unnecessary if the model was validated in the cloud.", "If engagement is up by 5%, the model is working better — don't touch it.", "The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "Diagnose first: The 15% lower pixel values in P3 cause an 8% depth estimation error, resulting in incorrect blur."], "correct_index": 3}}, {"id": "mobile-0018", "title": "The ML Notification Backlash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 78% precision make news notifications feel wrong at launch scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["78% precision at high volume is a UX disaster: 22% 
irrelevant x 40 notifications/day = 8.8 bad notifications, far exceeding user tolerance of 3-5 total. Fix: cap at 3 notifications/day, selecting highest-confidence predictions. Top-3 from 40 candidates achieves ~95% precision.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "78% precision means 78% of notifications are relevant — that's pretty good.", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model."], "correct_index": 0}}, {"id": "mobile-0020", "title": "The Secure Enclave Boundary", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you protect the embedding in transit, and why can't you just run the ML model inside the Secure Enclave?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Edge-side health checks should focus on hardware metrics; model-level metrics are too noisy.", "The Secure Enclave has ~256 KB of memory. It cannot run a model requiring ~25 MB of working memory.", "Run the face verification model inside the Secure Enclave for end-to-end security.", "Model obfuscation through weight scrambling provides equivalent security to hardware-backed encryption."], "correct_index": 1}}, {"id": "mobile-0021", "title": "The Cold Start Jitter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What system-level factors contribute to this initial delay, and how would you optimize them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0022", "title": "The Offline-First ML Design", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design an offline-first architecture that bridges the accuracy gap within a strict on-device storage budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it.", "Apply INT8 post-training quantization directly to ResNet-152 to shrink it from 230 MB to 57 MB to fit on-device.", "Design a tiered offline-first system: ship the domain-tuned 5.3 MB model on-device. 
This achieves a 43x size reduction and easily fits within the device's RAM and storage budgets.", "Prune the 230 MB model dynamically at runtime based on available device RAM."], "correct_index": 2}}, {"id": "mobile-0023", "title": "The On-Device Model Hot-Swap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you push a model fix in under 1 hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Ship the model as a server-side config and swap it remotely.", "Model weights are executable code, taking 4 hours to recompile on device.", "The solution is to reduce model complexity until it fits within the hardware constraints.", "Build dynamic model delivery within platform guidelines using a CDN and atomic pointer swap."], "correct_index": 3}}, {"id": "mobile-0024", "title": "The Silent Accuracy Degradation", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you detect six-month on-device classifier degradation when there are no server-side ground-truth labels?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Monitor accuracy using a held-out test set. You don't have labels for on-device predictions — there's no test set.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements.", "Four proxy signals detect degradation without ground truth: (1) Confidence distribution shift — track KL divergence of softmax outputs (~1 byte/inference, 30 KB/month). (2) Prediction entropy — rising entropy signals increasing uncertainty. (3) Feature drift — monitor input distribution statistics."], "correct_index": 3}}, {"id": "mobile-0025", "title": "The A/B Test Without Ground Truth", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you A/B test two on-device models when you can't observe the predictions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The key insight is that on mobile, the hardware itself generates the telemetry you need. Design an on-device A/B testing framework that uses hardware-level signals as proxy metrics when ground truth is unavailable.", "Send all predictions to the server for analysis. 
This defeats the purpose of on-device inference (privacy, latency) and may violate your privacy policy.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable."], "correct_index": 0}}, {"id": "mobile-0026", "title": "The On-Device Model Encryption Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can Core ML and TFLite on-device model bundles be protected after a competitor extracts them in 10 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Defense-in-depth with hardware-backed protection: Core ML encryption (requires Secure Enclave exploit) and Android split architecture.", "Encrypt the model file with AES-256 and decrypt it at runtime. This protects the file at rest, but the decrypted model must live in memory during inference.", "Use ProGuard or DexGuard to obfuscate the app's Java/Swift code, which automatically encrypts the bundled assets like ML models.", "Deploy the models as dynamic libraries (.so or .dylib) instead of flatbuffers, which prevents them from being reverse-engineered."], "correct_index": 0}}, {"id": "mobile-0027", "title": "The Mobile ML Telemetry Budget", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the telemetry system to keep ML telemetry for 5M DAU under a $500/month budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0028", "title": "The Model Rollback Nightmare", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you roll back only Samsung Galaxy S21 Android 13 users after a 40% no-detection spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0029", "title": "The ML Crash Report Black Hole", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose and fix a rare kernel crash without source access or local reproduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0030", "title": "The App Size Audit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you find at least 60 MB of app size savings while preserving all four mobile ML features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0031", "title": "The Mobile ML CI/CD Pipeline", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should CI/CD stop PyTorch-to-Core ML/TFLite handoffs from shipping a wrong-dataset model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0032", "title": "The Model Cache Eviction Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 30 MB photo-enhancement model in iOS Caches keep disappearing and taking 12s to reload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0033", "title": "The Device-Free ML Testing Strategy", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a camera ML team cover 200M iOS and Android users with only 5 local test phones?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0034", "title": "The Cross-Version Compatibility Maze", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship one app that works optimally across all these OS versions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Target the lowest common denominator — use only features available on iOS 16 and Android 10.", "Model tiering keyed to runtime capabilities, not OS version. Probe available APIs at launch: CoreML version, TFLite delegate support, quantization compatibility.", "Bundle all three model variants (48 MB total) to ensure immediate offline availability without runtime downloads.", "Use ONNX Runtime exclusively to bypass OS-level APIs and guarantee identical execution across all versions."], "correct_index": 1}}, {"id": "mobile-0035", "title": "The Mobile A/B Testing Infrastructure at Scale", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do 12 concurrent mobile ML experiments interfere across 2M DAU, and how should assignment be isolated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The interference comes from overlapping experiments sharing system resources on the same device.", "Run each experiment on a separate user cohort. 
With 12 experiments and 2M DAU, each cohort gets ~167K users.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance.", "Parallelizing pipeline stages across CPU and GPU always improves throughput, regardless of the relative speeds of each stage."], "correct_index": 0}}, {"id": "mobile-0036", "title": "The Performance Regression Detective", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What non-model regression could explain identical model binaries becoming slower after a mobile app update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0037", "title": "The Production Mobile ML Observability Stack", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the observability system that tells you, within 5 minutes, if any model is degrading on any device segment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0038", "title": "The Delivery Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What breaks at this scale that doesn't break in a demo?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 4}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it.", "Just use on-demand download after install — it's under 2 GB. This works for a prototype but collapses at 500M-device scale.", "At 500M installs, every decision multiplies by 500M. The system design must address five interacting constraints: 1. 
CDN Cost: Downloading 1.7 GB for every minor update is prohibitively expensive ($8.5M/update)."], "correct_index": 3}}, {"id": "mobile-0039", "title": "The Multi-Model Orchestration Nightmare", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you orchestrate pose, speech, and gesture models concurrently on one mobile SoC without missing real-time budgets?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0040", "title": "The User Consent Minefield", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does on-device federated learning require explicit user consent under GDPR even if data never leaves the device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0042", "title": "The Federated Keyboard", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you train the model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0043", "title": "The On-Device Personalization Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size and manage LoRA personalization for an on-device CLIP-like photo search model regarding memory, time, and migration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0044", "title": "The Keyboard Prediction Privacy Leak", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can top-5 keyboard predictions over 1000 queries leak private text even when the model stays on-device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0045", "title": "The On-Device Differential Privacy Budget", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can federated health training fit 100 rounds under a yearly differential privacy budget of epsilon 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0047", "title": "The Federated Learning System for a Social Media App", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a federated learning system for 
a social media app with 500 million daily active users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0049", "title": "The Privacy-Utility Squeeze", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What should you tell the PM when epsilon 2 differential privacy makes federated next-word prediction 40% worse, and how can you improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0052", "title": "The 50-Feature Mobile ML Platform", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a super-app with 50 ML features fit on 3 GB phones without shipping 50 separate full models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0053", "title": "The Model Loading I/O Cliff", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is there a 23x difference in load time, and how do you fix the user experience on budget Android phones?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0054", "title": "The Cellular Download Wall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a platform distribution perspective, what is the most critical, immediate problem with this approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large to fit into the device's RAM during runtime.", "It exceeds the ~200 MB cellular download limit, preventing users from installing the app without Wi-Fi.", "The 300 MB app size will take up too much of the user's total phone storage.", "Large app binaries significantly increase the time it takes for app store review and approval."], "correct_index": 1}}, {"id": "mobile-0058", "title": "The ANR Timeout", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of this event, and what is the standard time limit that is being exceeded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Jank Event, 16 ms", "ANR Event, 5 seconds", "System Crash, 1 second", "ANR Event, 30 seconds"], "correct_index": 1}}, {"id": "mobile-0059", "title": "The 60 FPS Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", 
"question": "How would you explain the maximum permissible latency for your model's inference to avoid causing UI 'jank', and what does 'jank' mean in this context?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 1}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33.33ms.", "16.67ms.", "Significantly less than 16.67ms, because the model shares the frame budget with UI rendering and other logic.", "66.67ms."], "correct_index": 2}}, {"id": "mobile-0060", "title": "The Synchronous Inference ANR", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the exact user experience on an Android device when that button is tapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The app will feel unresponsive for 6 seconds, then continue normally.", "The app will immediately crash due to an out-of-memory error.", "The UI will freeze, and after 5 seconds, an 'Application Not Responding' (ANR) dialog will appear.", "The OS will automatically restart the app after detecting the long-running task."], "correct_index": 2}}, {"id": "mobile-0061", "title": "The App Store Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can an 8 GB smartphone safely A/B test a 1.5 GB generative model when the app already uses 400 MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it's fine. The 1.5 GB model is much smaller than the device's 8 GB of RAM.", "Yes, it should be okay. The app's total memory budget is 2 GB, and the 1.5 GB model fits within that.", "No, it's too risky. The total required memory of 1.9 GB (1.5 GB model + 0.4 GB app) leaves almost no headroom within the ~2 GB per-app budget.", "No, it's impossible. 
The 1.5 GB model alone is larger than the app's base memory of 400 MB."], "correct_index": 2}}, {"id": "mobile-0063", "title": "The Cold Start Problem", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What creates a 500ms first-inference delay when later mobile ML inferences take only 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0064", "title": "The Battery Blame Game", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If ML averages only 20 mW, what drains 1% battery per minute in the workout app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0065", "title": "The Model Warm-up on Mobile", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the first CoreML inference on iPhone 15 Pro 100-250x slower than the 8ms steady-state path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0066", "title": "The Accessibility Conflict", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does VoiceOver read 'image' before an iPhone 15 on-device description model finishes social feed inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0067", "title": "The Accessibility Breakage", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong with the accessibility integration for the on-device ML smart replies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0069", "title": "The App Store Binary Size Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a 60 MB iOS app deliver an 800 MB INT8 image-generation model without exceeding the 200 MB cellular limit?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0072", "title": "The Heterogeneous Scheduling Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", 
"question": "Why do camera detection and LLM decoding slow down together on Snapdragon 8 Gen 3 despite separate compute units, and how do you fix it?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 0}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0073", "title": "The Display Pipeline Collision", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 60 FPS AR segmentation mask lag visibly when the iPhone display switches to 120 Hz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0074", "title": "The Concurrency Collision", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can two simultaneous NPU models each fall to 30% of solo FPS instead of sharing performance evenly (50%)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0075", "title": "The Janky Background App", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What common mobile systems issue is likely at play, and how would you try to identify the culprit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0076", "title": "The NPU Efficiency Advantage", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NPU achieve 3x better TOPS/W for the same workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0077", "title": "The Photo Segmentation Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the latency breakdown for Pixel Portrait Mode, and which stage is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0078", "title": "The Model Update Delta Compression", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Design a delta update system, calculate the expected patch size, and explain the unique challenge for quantized models.", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0081", "title": "The Camera Preview Stutter: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the beauty filter stutter every 2-3s when inference is a stable 8ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0082", "title": "The Hardware Decoder Synchronization", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is fundamentally wrong with putting hardware decoding in the critical path of a synchronous loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0083", "title": "The Double JPEG Decode Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where does the 140ms overhead in image picking come from, and how can it be bypassed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0084", "title": "The CoreML Model Compilation Jitter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the first Core ML initialization freeze the UI for 3 seconds when the bundled .mlmodel starts the camera?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0085", "title": "The Android NNAPI Driver Fallback", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is NNAPI making the model 16x slower than it should be on a specific device, and how do you fix it?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0086", "title": "The Thermal Cliff", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is happening, and why can't you just 'push through it'?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0087", "title": "The Zero-Copy Imperative", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", 
"status": "published", "phase": "inference", "question": "How do memory copies affect a real-time mobile video pipeline, and what zero-copy design should replace them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0088", "title": "The Adaptive Bitrate Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an adaptive system that switches between INT8 and INT4 precision based on thermal state, and calculate the latency and accuracy at each operating point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0089", "title": "Optimal Heterogeneous Graph Execution", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the execution strategy to minimize overall latency and power consumption on a mobile SoC with these heterogeneous compute units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0090", "title": "The Inference Timing Jitter Mystery", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the same computation take 3x longer sometimes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0091", "title": "The Cross-Platform ML Runtime Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you choose between native runtimes, ONNX Runtime, and a hybrid stack for cross-platform mobile inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0092", "title": "The 1000-Device Android Fragmentation Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship one model across 1000+ Android devices that works reliably and efficiently?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0093", "title": "The Cross-Platform Model Optimization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many optimized model binaries do you actually need, and what does the build pipeline look like?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0094", "title": "The Audio Pipeline Latency Creep", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 50 ms acoustic anomaly model alert nearly a second late when Android AudioRecord captures 1 s chunks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0095", "title": "The Battery Accounting Inversion", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does cutting NPU inference from 10 ms to 5 ms with INT8 increase feature battery drain by 15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0096", "title": "The Adaptive Power Maestro", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a system to dynamically adapt the SoC's operating parameters to meet both the performance and power constraints?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 5}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0097", "title": "The OS Scheduler's Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How does the mobile OS scheduler interact with these heterogeneous workloads, and what challenges arise in ensuring QoS?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 2}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0098", "title": "The Async Camera Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect this so the user sees smooth video with accurate segmentation?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 5}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0099", "title": "The Hidden Broadcast Receiver Wake-Ups", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why do 10 SMS inferences taking 2 ms each make an Android phishing detector a top 1% battery drainer?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0100", "title": "The Double FPU Context Save", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does floating-point C++ preprocessing make context switches 3x slower on a wearable companion chip?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0101", "title": "The Sustained vs Burst Reality", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did ten 12ms benchmark runs miss the two-minute thermal collapse of a 60 FPS AR filter?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 2}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0102", "title": "The Cellular Modem Power Surprise", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "You're uploading 50 bytes per second — how can that halve battery life?", "chain_ids": ["mobile-chain-bucket-powerbud-06"], "chain_positions": {"mobile-chain-bucket-powerbud-06": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0103", "title": "The Throttling Treadmill", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does object detection latency rise from 20 ms to 60 ms after 30 seconds of continuous on-device use?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0104", "title": "The Silent Battery Drain", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What common pitfalls in mobile ML background processing lead to excessive battery drain, and how can you mitigate them?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0105", "title": "The Battery Drain Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why might a GPU use more peak power but complete inference 
faster than a CPU, and which is better for battery life?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0106", "title": "The Thermal Throttling Death Spiral", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the game halve framerate after 15 minutes even with 50% CPU/GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0107", "title": "The Accelerometer Inference Power", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can you keep total power under 5 mW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0108", "title": "The \"Warm-Up\" Performance Drop", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary cause for this sustained performance drop, and how is it confirmed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0109", "title": "The TDP Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 45 TOPS chip tie with a 35 TOPS chip, and what does this tell you about evaluating mobile SoCs for ML?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0110", "title": "The Thermal Throttling Trap", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 15ms Android AR filter slow to 40ms after five minutes on a mobile SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0111", "title": "The DVFS Polling Delay", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does doing more work (scrolling + inference) make the inference 2x faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0112", "title": "The Background ML Battery Drain", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a tiny MobileNetV3 photo tagger with only 600 ms of total inference compute 
drain 15% of the battery overnight?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0113", "title": "The Power Hungry Framework", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What factors explain the 20-30% higher power draw of TFLite on Android vs Core ML on iOS, and how can it be optimized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0114", "title": "The Pocket Furnace", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 3B INT4 assistant overheat a mobile device during a 10-minute chat, and how can it stay responsive while remaining within the thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0115", "title": "The Background ML Battery Vampire", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 0.3 ms model drain 15% of a 4355 mAh battery?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0116", "title": "The Power Domain Juggling Act", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does adding a tiny vision model to the low-power DSP inflate the system's power budget by over 100x?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0117", "title": "The Sustained Performance Cliff", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most likely cause, and how would you design your ML system to provide more sustained performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0118", "title": "The Background Thermal Throttling", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the background execution 9x slower than the 
foreground execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0119", "title": "The Background Inference Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many inferences can you run per background cycle, and what is the energy cost of running on NPU vs CPU?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0120", "title": "The Thermal Throttling Prediction", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a style-transfer app stay smooth when thermal throttling doubles latency after 45s?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 4}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0121", "title": "The Profiling Tool Blind Spot", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do TFLite benchmark, Systrace, and Perfetto report 8ms, 12ms, and 15ms for the same inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0122", "title": "The iOS vs Android ML Framework Maze", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is there a 2x performance gap, and how do you close it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0123", "title": "The Dynamic Shape Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic shape support cost so much, and when is padding to a fixed shape the better strategy despite wasting compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0124", "title": "The CoreML Custom Op Conversion Failure", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is simply removing the RoPE layer a bad idea, and how should unsupported ops be handled in CoreML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0125", "title": "The CoreML Conversion Black Hole", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose and fix a silently broken Core ML conversion of a custom GLU operation in a 45-layer PyTorch model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0126", "title": "The Dynamic Shape Recompilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is CoreML doing during those extra 13 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0127", "title": "The WebGPU ML Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "When is WebGPU a reasonable alternative to a native mobile ML SDK for a 50M-parameter image classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0128", "title": "The Cellular Model Download Failure", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a 180 MB first-launch model download fail for 40% of India and Southeast Asia users on 4G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0129", "title": "The Big.LITTLE Task Migration", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a CPU hand-tracking model jump from 8 ms to 30 ms after 2 minutes without thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0131", "title": "The Camera VSync Deadlock", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 12ms ML stage with 4.6ms of headroom still drop a 60Hz camera pipeline to 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0132", "title": "The Memory-Mapped File Deadlock", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is a background memory read stalling the foreground UI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0136", "title": "The NPU Energy Advantage", 
"topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following processor types on a standard mobile SoC (System on a Chip) is the most **energy-efficient** choice for these operations?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU, because it has the highest parallel throughput (TOPS).", "The CPU, because it avoids the latency overhead of memory transfers to another processing unit.", "The NPU, because its specialized hardware for low-precision integer math is vastly more energy-efficient.", "The NPU, but it's only slightly more efficient (e.g., 1.5-2x) than the CPU."], "correct_index": 2}}, {"id": "mobile-0138", "title": "The CPU Fallback Energy Tax", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much more energy-efficient is the INT8 version compared to the FP32 version for that specific operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x more efficient", "~2x more efficient", "~18x more efficient", "~100x more efficient"], "correct_index": 2}}, {"id": "mobile-0141", "title": "The Mobile Thermal Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the most likely physical limit the System-on-a-Chip (SoC) is encountering, and what is its approximate power value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SoC is hitting a software power limit of around 30 Watts.", "The model is likely hitting a memory bandwidth limit of a few hundred milliwatts.", "The SoC is thermal throttling, hitting its sustainable power budget of 3-5 Watts.", "The operating system is de-prioritizing the app due to a 3W memory leak over 60 seconds."], "correct_index": 2}}, {"id": "mobile-0142", "title": "The Fusion Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When a framework like Core ML or TensorFlow Lite applies operator fusion, what is the primary hardware cost it aims to reduce?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The total number of arithmetic computations (FLOPs).", "The model's storage footprint on the device's flash memory.", "The latency and power cost of writing intermediate tensors to main memory (DRAM).", "The time spent delegating unsupported operators to the CPU."], 
"correct_index": 2}}, {"id": "mobile-0143", "title": "The Privacy Wall", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason you must use a privacy-preserving technique like federated analytics instead of simply logging raw camera images and sending them to your servers?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The daily cellular data usage would be too expensive for users.", "The constant network requests would drain the device's battery too quickly.", "Uploading raw user data is a major privacy violation and breaks user trust.", "The on-device storage is insufficient to buffer the image logs before uploading."], "correct_index": 2}}, {"id": "mobile-0144", "title": "The Privacy Memory Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a hardware resource perspective, what is the most significant new constraint introduced by adding on-device DP to the training process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased CPU usage from the cryptographic noise generation.", "Increased network bandwidth to send the larger, noisy model updates.", "A major increase in peak RAM to store per-example gradients.", "Increased flash storage needed to save the privacy-preserving model."], "correct_index": 2}}, {"id": "mobile-0146", "title": "The Cellular Download Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What cellular download size limit should mobile ML model delivery usually be designed around?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 MB", "~200 MB", "14 GB", "No limit, it depends on the user's data plan"], "correct_index": 1}}, {"id": "mobile-0147", "title": "The NPU Fallback Power Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the new per-inference power draw when 20% of an NPU workload falls back to a CPU that is 10x less efficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["120 mW (Mistake: Assumes a simple linear 20% power increase)", "200 mW (Mistake: Calculates only the power for the CPU portion)", "280 mW", "1000 mW (Mistake: Applies the 10x penalty to the entire original power draw)"], "correct_index": 2}}, {"id": "mobile-0149", 
"title": "The NPU Fallback Memory Cost", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total memory in megabytes (MB) required to store the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0 MB", "5.0 MB", "9.5 MB", "5.5 MB"], "correct_index": 2}}, {"id": "mobile-0151", "title": "The Mobile Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum duty cycle percentage the model can be active to stay within the 5W thermal budget?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["71%", "100%", "50%", "29%"], "correct_index": 2}}, {"id": "mobile-0152", "title": "The Operator Fusion Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much latency does fusing a Conv2D and ReLU save when it removes one dispatch and one memory round-trip?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1,000 ns (Ignores memory round-trip cost)", "5,000 ns (Ignores kernel dispatch overhead)", "6,000 ns (Correctly sums dispatch and memory overhead)", "0 ns (Assumes no latency benefit since FLOPs are unchanged)"], "correct_index": 2}}, {"id": "mobile-0153", "title": "Federated Averaging's Blind Spot", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How can this seemingly small global drop hide a significant problem, and what is the actual accuracy degradation for the affected 20% of users?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The accuracy for affected users dropped by 1%. The problem is being exaggerated.", "The accuracy for affected users dropped by 20%. The model is completely broken for them.", "The accuracy for affected users dropped by 5%. 
A significant degradation is being masked by the fleet average.", "It's impossible to tell the local drop without knowing the new local accuracy value directly."], "correct_index": 2}}, {"id": "mobile-0154", "title": "The Federated Learning Upload Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary communication bottleneck for the user, and what is the data payload size uploaded per training round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 MB", "20 MB", "10 MB", "40 MB"], "correct_index": 2}}, {"id": "mobile-0155", "title": "The Battery Drain A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much extra daily energy does Model B consume if it uses 1W more than Model A for 10 minutes per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.0 W", "600 Wh", "~0.17 Wh", "~0.67 Wh"], "correct_index": 2}}, {"id": "mobile-0156", "title": "The Delta Update Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate size of a delta patch to upgrade the model, and does it solve the OTA limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~170 MB", "~45 MB", "~175 MB", "220 MB"], "correct_index": 2}}, {"id": "mobile-0157", "title": "The 85% Delegation Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can 15 CPU fallback operations dominate latency even when 85 of 100 operations run on the NPU?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The overall performance will be roughly 85% of the NPU's peak performance.", "The CPU is the bottleneck; the 15 unsupported ops take 45x longer than the 85 NPU ops.", "The NPU is the bottleneck because it executes 85% of the total operations.", "The CPU portion is negligible because the NPU is over 250x faster than the CPU."], "correct_index": 1}}, {"id": "mobile-0158", "title": "The App Memory Guillotine", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What safe memory budget should an Android ML feature target on a phone with 8 GB of RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 6-7 GB, since the OS only needs about 1-2 GB.", "Approximately 2 GB.", "Around 
4 GB, which is half of the total RAM.", "Unlimited, as long as the user isn't running other apps."], "correct_index": 1}}, {"id": "mobile-0159", "title": "The Unified Memory Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental hardware difference that makes deploying a 14 GB model on a 12 GB smartphone infeasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The phone's NPU is not powerful enough to run a 7B model.", "The 12 GB of RAM is unified and shared with the OS, leaving a much smaller budget for the app.", "The phone's memory bandwidth is too low to handle the model's weights.", "The 14 GB model fits perfectly in the 12 GB RAM using virtual memory swapping without penalty."], "correct_index": 1}}, {"id": "mobile-0161", "title": "The Jank Instigator", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Interpret this situation: what happens to the frame rate, and what is the approximate new FPS?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 1}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The game will crash because the SoC is drawing too much power.", "The frame rate will drop to 30 FPS as the system cuts performance by half.", "The frame rate will remain at 60 FPS, but the phone will get dangerously hot.", "The frame rate will drop to approximately 50 FPS as the SoC scales down performance."], "correct_index": 3}}, {"id": "mobile-0162", "title": "The A17 Pro Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the A17 Pro ridge point from 35 TOPS and 51.2 GB/s mean for mobile model optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. This is a unit-confusion error (forgetting the 1000x difference between Tera and Giga).", "~1.46 Ops/Byte. This is an inversion error (dividing bandwidth by compute) and misinterprets the result.", "~684 Ops/Byte. This high value means most neural network layers will be compute-bound on the A17 Pro.", "~684 Ops/Byte. 
Layers must have an arithmetic intensity greater than this to be compute-bound."], "correct_index": 3}}, {"id": "mobile-0164", "title": "The App Budget Constraint", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using half-precision floating point (FP16), what is the memory footprint of just the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["750 MB", "1.5 GB", "3.0 GB", "12 GB"], "correct_index": 1}}, {"id": "mobile-0165", "title": "The Mobile LLM's Memory Hog", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory is needed for 1024 tokens in a 20-layer, hidden-size-512 mobile LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 MB", "40 KB", "40 MB", "80 MB"], "correct_index": 2}}, {"id": "mobile-0167", "title": "The Quantization Energy Dividend", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "About how much more energy-efficient is an INT8 MAC than an FP16 MAC for mobile inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x more efficient (linear scaling with bit-width)", "About 5x more efficient", "About 18x more efficient (assuming FP32 vs INT8 scaling)", "The savings are negligible (<10%) due to conversion overhead"], "correct_index": 1}}, {"id": "mobile-0168", "title": "The App Store Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary precision trade-off and new memory footprint when quantizing a 150M parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75 MB. Quantization from 32-bit floats provides a 4x reduction, so the 300 MB model becomes 75 MB.", "300 MB. Quantization to INT8 doesn't affect the stored weight size, only the precision of calculations during inference.", "150 MB. Each 2-byte FP16 parameter is reduced to a 1-byte INT8 parameter, halving the memory.", "1.2 GB. 
The model has 1.2 billion bits for its weights (150M x 8), which is roughly 1.2 GB."], "correct_index": 2}}, {"id": "mobile-0170", "title": "The Illusion of Symmetric Scaling", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do doubling model width and doubling input resolution each change FLOPs for a 5 GFLOP mobile vision model?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both strategies result in ~10 GFLOPs.", "Width scaling: ~20 GFLOPs; Resolution scaling: ~10 GFLOPs.", "Both strategies result in ~20 GFLOPs.", "Width scaling: ~10 GFLOPs; Resolution scaling: ~20 GFLOPs."], "correct_index": 2}}, {"id": "mobile-0171", "title": "The Mobile Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 60 FPS user experience, what is the absolute maximum latency your model inference can have before the user perceives jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms (Standard web latency target)", "~33 ms (30 FPS budget)", "~16 ms", "~1 ms (Audio processing target)"], "correct_index": 2}}, {"id": "mobile-0172", "title": "The Duty Cycle Power Trap", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a 1-hour period, which model drains more battery?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model A (the 5W model)", "Model B (the 3W model)", "They drain the same amount", "It's impossible to tell without knowing the device's battery capacity"], "correct_index": 1}}, {"id": "mobile-0174", "title": "The Voice Assistant Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Will an audio inference queue stay stable if chunks arrive every 100ms and processing takes 90ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is unstable because at 90% utilization, there is no margin for error and the queue will eventually overflow.", "The system is stable because the processing time (90ms) is less than the arrival interval (100ms).", "The system is unstable because the workload (900ms per second) causes immediate thermal throttling.", "The system is stable but will have an infinite queue delay because 90ms leaves no time for memory IO."], "correct_index": 1}}, {"id": "mobile-0175", "title": "The 
Cost of a Glance", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate average power consumption of this feature over its 10-second cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 W", "2.5 W", "500 mW", "50 mW"], "correct_index": 2}}, {"id": "mobile-0178", "title": "The Fusion Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percent overhead does an unfused Conv-ReLU memory roundtrip add for a 1 MB tensor at 51.2 GB/s?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.7% overhead. Fusion saves writing the intermediate tensor to DRAM.", "0% overhead. The memory access time is negligible compared to compute.", "~3.5% overhead. Fusion saves a DRAM write/read roundtrip.", "~22% overhead. The memory bandwidth is the main bottleneck."], "correct_index": 2}}, {"id": "mobile-0179", "title": "The 7B Parameter Illusion", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "As the mobile ML systems engineer, what minimum FP16 memory footprint should you cite to ground the conversation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~7 GB", "~28 GB", "~14 GB", "~2 GB"], "correct_index": 2}}, {"id": "mobile-0180", "title": "The OTA Memory Budget", "topic": "compound-ai-systems", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain to the junior engineer, using napkin math, whether this update is safe to roll out from a memory perspective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Unsafe. The new 1.0 GB model added to the 1.2 GB base usage is 2.2 GB, which exceeds the 2.0 GB budget.", "Unsafe. A 1B parameter model requires 2.0 GB of memory (using 2 bytes/param), which is the entire budget.", "Safe. The memory increase is only 0.5 GB, bringing the new peak to 1.7 GB, which is under the 2.0 GB budget.", "Safe. 
The OS will use memory mapping, so the model's size on disk doesn't count against the app's RAM budget."], "correct_index": 2}}, {"id": "mobile-0181", "title": "The Frozen UI Watchdog", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is a standard, low-level timer mechanism used to proactively detect this frozen state and trigger a faster, more graceful recovery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A software watchdog timer", "Relying on the OS 'Application Not Responding' (ANR) timeout", "A network-based health check to a remote server", "Wrapping the inference call in a try/except block"], "correct_index": 0}}, {"id": "mobile-0184", "title": "The Camera Pre-processing Skew", "topic": "mlops-lifecycle", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the primary risk of this pipeline and calculate the approximate data reduction factor between a single training frame and a single serving frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A ~14x reduction. This is a typical and acceptable trade-off for mobile performance.", "A ~330x reduction. This skew is caused by forgetting to account for the 3 color channels in the serving image.", "A ~110x reduction. This causes training-serving skew because the on-device preprocessing artifacts are not in the training data.", "A ~880x reduction. This level of compression is too high and indicates a miscalculation in bit-to-byte conversion."], "correct_index": 2}}, {"id": "mobile-0185", "title": "The Two-Billion Dollar Keystroke", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why can Federated Learning be economically justified for keyboard prediction despite higher direct implementation costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The $10M/year cost increase is not justifiable as it triples the project's budget.", "FL saves significant network costs by not having to upload petabytes of user data.", "The potential cost of a data breach far exceeds the implementation cost difference.", "FL provides better model accuracy through on-device personalization."], "correct_index": 2}}, {"id": "mobile-0187", "title": "The Mobile Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a layer doing 50 GOps while reading 500 MB memory-bound on an A17 Pro, and what roofline math proves it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The ANE's 35 TOPS is the limiting factor.", "Memory-bound. 
The layer's arithmetic intensity is ~100 Ops/Byte, which is significantly lower than the A17 Pro's ridge point of ~683 Ops/Byte.", "Compute-bound. The layer's arithmetic intensity is ~100 Ops/Byte, which is high enough to saturate the processor.", "Neither. The workload is balanced because the amount of data and compute are both large."], "correct_index": 1}}, {"id": "mobile-0188", "title": "The Mobile Memory Chasm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much slower is a random read from the phone's UFS flash storage compared to its main LPDDR5 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash is ~1,000× slower than DRAM.", "Flash is ~10× slower than DRAM.", "They have similar latency, within 2-3× of each other.", "Flash is actually faster for sequential burst reads, making it superior for models."], "correct_index": 0}}, {"id": "mobile-0189", "title": "The Mobile KV-Cache Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory is required for a 24-layer, 16-head LLM with 4096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~192 MB", "~384 MB", "~768 MB", "~96 MB"], "correct_index": 1}}, {"id": "mobile-0190", "title": "The TinyML vs Mobile Memory Arena", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can a 250 KB keyword model that fits a Cortex-M7 tensor arena still face memory constraints on a 12 GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile app can use all 12GB of RAM, making it over 48,000x larger. (Trap: Assuming no OS caps)", "The mobile app's budget is orders of magnitude larger (~12,000x), but it's higher-latency DRAM managed by an OS, unlike the microcontroller's dedicated high-speed SRAM.", "The memory is the same type (RAM), so the only difference is the amount available. (Trap: Missing SRAM vs DRAM difference)", "The mobile OS overhead consumes most of the RAM, so the actual available memory is similar to the TinyML device. 
(Trap: Numerically false)"], "correct_index": 1}}, {"id": "mobile-0191", "title": "The Quantization Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the final memory requirement for a 7B parameter model's weights after quantizing from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "3.5 GB", "7 GB", "700 MB"], "correct_index": 2}}, {"id": "mobile-0195", "title": "The SoC Power Draw", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When the model is actively processing audio, what is a realistic power consumption value for the phone's System-on-a-Chip (SoC)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30 W", "~4 W", "~15 W", "~1.5 W"], "correct_index": 1}}, {"id": "mobile-0197", "title": "The Background Battery Drainer", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming negligible power consumption during sleep, what percentage of a 19Wh phone battery does a 0.5-second, 3W task every 10 seconds use in 24 hours?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Over 100%. The feature consumes 72Wh, which isn't possible.", "Around 3.2%. The feature is active for about 0.2 hours per day.", "Around 18.9%. The feature is active for 1.2 hours and consumes 3.6Wh.", "Around 6.3%. The duty cycle is 1/60, leading to 1.2Wh of consumption."], "correct_index": 2}}, {"id": "mobile-0198", "title": "The AR Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What duty cycle can a 5W AR feature sustain when idle power is 1W and passive cooling dissipates 3W?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["60%. The phone can dissipate 3W of the 5W active power.", "100%. The SoC can handle 5W without issue.", "40%. The excess heat generated is 2W, which is 40% of the active power.", "50%. 
The weighted average of active and idle power must equal the 3W dissipation rate."], "correct_index": 3}}, {"id": "mobile-0199", "title": "The OS Kill Switch: Mobile Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum total app memory budget should you target on an 8 GB smartphone to avoid OS termination?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB", "8 GB", "2 GB", "500 MB"], "correct_index": 2}}, {"id": "mobile-0200", "title": "The On-Device RAG Budget", "topic": "compound-ai-systems", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory do the FP16 weights of a 1B parameter model require, and does it fit within a standard mobile app budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It needs 1 GB, so it fits easily.", "It needs 2 GB, which is much less than the 8 GB device RAM, so it fits easily.", "It needs 2 GB, which consumes the entire 25% app budget, making it too risky.", "It needs 4 GB, so it won't fit."], "correct_index": 2}}, {"id": "mobile-0201", "title": "The Watchdog's Power Tax: Monitoring & Observability", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy, in Watt-hours (Wh), consumed only by the watchdog process over a 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 Wh", "0.72 Wh", "0.12 Wh", "432 Wh"], "correct_index": 2}}, {"id": "mobile-0202", "title": "The Hidden Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest operational cost introduced by choosing the Federated Learning design at scale?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 0}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased on-device compute and battery drain.", "Server compute cost for aggregating model updates.", "Network communication overhead from gradient uploads.", "Storage costs for the global model on the server."], "correct_index": 2}}, {"id": "mobile-0203", "title": "The Economics of Privacy: Centralized vs. 
Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What monthly cloud ingress cost results from 1M users uploading 5 MB of training data per day at $0.01 per GB?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 1}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$50", "$500", "$1,500", "$15,000"], "correct_index": 2}}, {"id": "mobile-0204", "title": "The Mobile Roofline Limit", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a layer with 100 Ops/Byte memory-bound or compute-bound on an NPU with 35 TOPS and 51.2 GB/s bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Ops/Byte; Compute-bound", "683 Ops/Byte; Compute-bound", "100 Ops/Byte; Memory-bound", "683 Ops/Byte; Memory-bound"], "correct_index": 2}}, {"id": "mobile-0205", "title": "The Mobile KV-Cache Memory Trap", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory does a 32-layer, 4096-hidden LLM need for a 4096-token context on mobile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 GB", "4 GB", "2 GB", "256 MB"], "correct_index": 2}}, {"id": "mobile-0206", "title": "The On-Device Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Explain whether the model's weights will fit into this budget using FP16 precision, and contrast this with using INT8 precision.", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both will fit easily within the 8 GB of device RAM.", "Neither will fit; the model is too large for on-device deployment.", "It will exceed the budget in FP16, but fit using INT8.", "Both require the same 1.5 GB of memory, so both will fit."], "correct_index": 2}}, {"id": "mobile-0207", "title": "The MobileNet Parameter Diet", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much does a 3x3 depthwise separable convolution reduce parameters for 128 input and 256 output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No change in parameters, only latency improves", "~2x reduction", "~256x reduction", "~9x reduction"], "correct_index": 3}}, {"id": "mobile-0208", "title": "The Two Latencies of Generative AI", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", 
"status": "published", "phase": "inference", "question": "What standard industry terms define these two critical performance metrics in generative models?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 0}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Initial Latency and Inference Speed", "Cold Start and Warm Read", "P50 Latency and P99 Latency", "Time to First Token (TTFT) and Time Per Output Token (TPOT)"], "correct_index": 3}}, {"id": "mobile-0212", "title": "The Watchdog's Battery Bill", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the total energy cost in Watt-hours (Wh) for watchdog recovery events over an 8-hour drive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.33 Wh", "0.0028 Wh", "0.089 Wh", "5.0 W"], "correct_index": 2}}, {"id": "mobile-0213", "title": "The Federated Learning Cost Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a single training round, which of the following operations is the primary driver of the user's cost in terms of battery consumption and data-plan usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local model training computation on the NPU (~1 Joule).", "Reading the local training data from UFS flash storage (~0.1 Joule).", "Transmitting the computed model update over the cellular network (~10 Joules).", "The overhead from the A/B testing framework that selected the user (~0.01 Joule)."], "correct_index": 2}}, {"id": "mobile-0215", "title": "The Mobile VFX Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of this layer, and is it compute-bound or memory-bound on an A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; 1.4 Billion operations is a heavy workload that will saturate the ANE's compute units.", "Memory-bound; the AI is 100 Ops/Byte, which is less than the A17 Pro's ridge point.", "Memory-bound; the AI is 200 Ops/Byte, which is less than the A17 Pro's ridge point of ~683 Ops/Byte.", "Compute-bound; the AI is 200 Ops/Byte, and any AI over 100 is typically considered high enough to be compute-bound."], "correct_index": 2}}, {"id": "mobile-0216", "title": "The Mobile Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the memory requirements and calculate the model's final storage size in megabytes after full INT8 quantization?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7.5 MB", "30 MB", "15 MB", "60 MB"], "correct_index": 2}}, {"id": "mobile-0217", "title": "The Depthwise Separable Dividend: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much computational reduction should replacing a standard 3x3 convolution with a depthwise separable convolution provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It provides no computational reduction, it only saves memory.", "A ~3x reduction.", "A ~9x reduction.", "A ~2x reduction."], "correct_index": 2}}, {"id": "mobile-0219", "title": "The App Memory Diet", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a reasonable rule-of-thumb memory budget for your entire feature, including the model and runtime activations, to avoid being terminated by the OS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB (Fails to account for OS footprint)", "8 GB (Total theoretical RAM)", "256 MB (Unnecessarily restrictive)", "2 GB"], "correct_index": 3}}, {"id": "mobile-0220", "title": "The 16ms UI Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To avoid UI jank, what is the maximum number of tokens you can generate synchronously before you must implement a background-threaded generation strategy?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 1}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["32 tokens", "60 tokens", "8 tokens", "16 tokens"], "correct_index": 2}}, {"id": "mobile-0221", "title": "The 'Ambient Assistant' Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percentage of the total battery capacity will this feature consume over a 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~32%", "~39%", "~164%", "~320%"], "correct_index": 1}}, {"id": "mobile-0222", "title": "The Runaway Inference Battery Drain", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would you calculate the approximate battery energy consumed by a single runaway inference that lasts for 60 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["300 Wh. (Calculated Joules but used Wh unit)", "5 Wh. 
(Used Watt rating directly as Wh)", "≈ 0.083 Wh. This is a significant battery drain for a single failed operation", "8.3 Wh. (Forgot to divide by full 3600)"], "correct_index": 2}}, {"id": "mobile-0224", "title": "The Battery Cost of Federated Personalization", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the total energy consumed by this 15-minute daily federated learning feature per user over a 30-day period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.75 Wh", "3.0 W", "22.5 Wh", "2160 Wh"], "correct_index": 2}}, {"id": "mobile-0226", "title": "The App Store Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If you quantize the model's weights to INT8, what is the new size of the model on disk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 MB", "32 MB", "4 MB", "2 MB"], "correct_index": 2}}, {"id": "mobile-0227", "title": "The Mobile Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of the style transfer model, and is it memory-bound on this NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Ops/Byte; Memory-bound, as its AI is far below the device's ridge point.", "140 Ops/Byte; Compute-bound, as this is a high operational intensity.", "684 Ops/Byte; Compute-bound, as the model's intensity will match the device's.", "17.5 Ops/Byte; Memory-bound, from incorrectly converting bytes to bits."], "correct_index": 0}}, {"id": "mobile-0228", "title": "The Mobile Memory Squeeze", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does quantizing the 750M parameter FP16 model to INT8 allow it to fit within the 1 GB memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FP16 is 750 MB, INT8 is 375 MB. Both fit.", "FP16 is 3.0 GB, INT8 is 1.5 GB. Neither fits.", "FP16 is 1.5 GB, INT8 is 750 MB. The INT8 model fits.", "FP16 is 1.5 GB, INT8 is 1.5 GB. There are no memory savings."], "correct_index": 2}}, {"id": "mobile-0231", "title": "NPU vs. 
Reality", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Given the A17 Pro's peak performance of 35 TOPS, what is the theoretical maximum frame rate (FPS) you could achieve, ignoring all memory, OS, and framework overhead?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 FPS", "5 FPS", "500 FPS", "50 FPS"], "correct_index": 3}}, {"id": "mobile-0232", "title": "The Mobile Generative UI Latency Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For time-to-last-result, is it faster to process 3 sequential 150ms edits or one 300ms batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450ms (no-batch) vs. 450ms (batch); batching is always better.", "150ms (no-batch) vs. 300ms (batch); no-batching is better for throughput.", "450ms (no-batch) vs. 300ms (batch); no-batching provides a faster 150ms time-to-first-result.", "450ms (no-batch) vs. 225ms (batch); batching time scales linearly."], "correct_index": 2}}, {"id": "mobile-0233", "title": "The Drowsy Driver's Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy in Watt-hours (Wh) this feature will consume from the phone's battery over the course of an 8-hour road trip?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 Wh", "0.2 Wh", "1.6 Wh", "1.8 Wh"], "correct_index": 2}}, {"id": "mobile-0234", "title": "The 25% Mobile Memory Rule", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before checking the model's specific memory footprint, what is the first-order, rule-of-thumb application memory budget you should recall for a typical high-end smartphone with 8GB of RAM?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 0}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 GB", "4 GB", "2 GB", "256 MB"], "correct_index": 2}}, {"id": "mobile-0236", "title": "The Million-Car Update", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When evaluating the Total Cost of Ownership (TCO) for this single experiment, which of the following costs should you identify as the most significant and immediate financial factor?", "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The $5,000 cloud compute cost to retrain the model.", "The $50,000 cellular data cost to download the 50MB model update.", "The $500,000 fleet-wide cellular data cost to download the 50MB model update.", "The $5,000,000 increased battery consumption from running a larger model."], "correct_index": 2}}, {"id": "mobile-0237", "title": "The Fleet-Level Cost of an 'Always-On' Mobile Feature", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the annual energy cost across a fleet of 1 million cars if the feature is used 1.5 hours per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$186.15", "$186,150,000", "$186,150", "$2,978,400"], "correct_index": 2}}, {"id": "mobile-0239", "title": "The INT8 Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the memory savings in megabytes (MB) for the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450 MB", "300 MB", "150 MB", "75 MB"], "correct_index": 2}}, {"id": "mobile-0240", "title": "The Depthwise Efficiency Factor", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you'd expect to see in those layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2-3x (Trap: Underestimating quadratic scaling)", "Roughly 8-9x", "Roughly 4-5x (Trap: Miscalculating K squared)", "It's the same, but memory is reduced (Trap: Confusing params with FLOPs)"], "correct_index": 1}}, {"id": "mobile-0241", "title": "The First-Word Latency Test", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which metric is most critical to minimize for this initial perception of responsiveness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Time To First Token (TTFT)", "Model Loading Time", "Total Generation Time for the sequence"], "correct_index": 1}}, {"id": "mobile-0244", "title": "The 7 Billion Parameter Car Crash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware specs for mobile-class systems, what is the most immediate, fundamental blocker for this plan?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU's TOPS limit will cap inference at 1 token/sec.", "The OTA download exceeds cellular bandwidth SLAs.", "The 7 GB INT8 footprint physically exceeds the system's available RAM.", "The context window will max out the KV cache in 2 turns."], "correct_index": 2}}, {"id": "mobile-0245", "title": "The OTA Storage Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak storage is required during a rollback-safe OTA update from a 500 MB model to a 1.8x larger v2 model?", "chain_ids": ["mobile-chain-auto-001-05"], "chain_positions": {"mobile-chain-auto-001-05": 1}, "chain_tiers": {"mobile-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1600 MB. The final size is well within the limit.", "1700 MB. The peak is the initial size plus the old model being duplicated.", "2400 MB. The update will temporarily violate the 2 GB storage limit.", "1950 MB. The update is large but stays just within the limit."], "correct_index": 2}}, {"id": "mobile-0246", "title": "The Battery Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate power consumption of a modern smartphone's System-on-a-Chip (SoC) when it's actively running an ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 milliwatts", "~30 Watts", "~3–5 Watts", "~700 Watts"], "correct_index": 2}}, {"id": "mobile-0247", "title": "The Privacy vs. 
Battery-Life Tax", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which uses less energy for mobile A/B testing: 60s of 3W federated learning or a 3s, 5W cellular upload?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Cloud approach is much costlier (~1500 Joules), as uploading data is always expensive.", "B) Both approaches consume roughly the same amount of energy (~150-180 Joules).", "C) The FL approach is much costlier (~180 Joules vs ~15 Joules for the cloud).", "D) The energy cost cannot be compared, as one is compute (Watts) and the other is data (MB)."], "correct_index": 2}}, {"id": "mobile-0248", "title": "The A17 NPU Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is an A17 Pro layer with 50 GOps and 200 MB of memory traffic memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the required 50 Giga-ops exceeds the capacity of the NPU.", "Memory-bound, because the memory bandwidth number (51.2) is much smaller than the compute number (35,000).", "Memory-bound, because its arithmetic intensity (250 Ops/Byte) is below the A17's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity (250 Ops/Byte) is high, indicating a heavy compute load."], "correct_index": 2}}, {"id": "mobile-0249", "title": "The On-Device Memory Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the memory requirements for both precisions and calculate the total memory savings in gigabytes?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 0}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "3.5 GB", "7 GB", "21 GB"], "correct_index": 2}}, {"id": "mobile-0250", "title": "The MobileNet Multiplier: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary benefit you recall this change provides?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly improves model accuracy.", "It provides a >100x reduction in computation.", "It reduces memory by 4x by using INT8 precision.", "It provides a roughly 9x reduction in computation and parameters."], "correct_index": 3}}, {"id": "mobile-0252", "title": "The Real-Time 
Translation Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the model's token generation latency to the UI frame budget to determine the source of the jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 tokens/sec is high throughput; the model isn't the bottleneck.", "The 150ms TTFT causes a 9-frame initial drop; subsequent tokens are fast enough.", "At ~33.3ms per token, synchronous inference blocks the main UI thread for over two 16ms frames.", "The UI thread is blocked for 16ms, creating a 50% inference timeout."], "correct_index": 2}}, {"id": "mobile-0253", "title": "The Background Battery Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can you calculate the total energy this feature consumes from the battery over one hour of continuous operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 Wh", "0.1 W", "0.1 Wh", "10.0 Wh"], "correct_index": 2}}, {"id": "mobile-0254", "title": "The OTA Update Space Crunch", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much free space remains during an atomic OTA update when the framework, v1 model, compressed v2, and uncompressed v2 coexist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["648 MB", "150 MB", "198 MB", "798 MB"], "correct_index": 2}}, {"id": "mobile-0255", "title": "The Mobile Power Chasm", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate active-inference-to-deep-sleep power ratio for a mobile SoC?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100x", "1,000x", "500,000x", "10x"], "correct_index": 2}}, {"id": "mobile-0256", "title": "The Federated Learning Battery Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much annual energy does one 5-minute, 3W federated learning round per day consume on a user's phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["15 Wh. (Trap: 3W * 5 min)", "1,095 Wh. (Trap: 3W * 365 without hour conversion)", "91.25 Wh.", "2190 Wh. 
(Trap: Wrong unit base entirely)"], "correct_index": 2}}, {"id": "mobile-0257", "title": "The Mobile NPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 2 GOps, 10 MB object detection inference memory-bound on an A17 Pro, and what throughput does that imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity is over 200,000,000 Ops/Byte, which easily saturates the ANE.", "Memory-bound, because its Arithmetic Intensity of 200 Ops/Byte is less than the A17's ridge point of ~683 Ops/Byte.", "Compute-bound, because 35 TOPS is a massive amount of performance, and the model is relatively small.", "It's impossible to tell without knowing the power efficiency in TOPS/W for this specific model."], "correct_index": 1}}, {"id": "mobile-0258", "title": "The Depthwise Separable Efficiency Gain", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you should expect from this change?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2-3× reduction (assuming standard grouped convolution scaling)", "No significant reduction (assuming memory bounds negate compute savings)", "~8-9× reduction", "~27× reduction (incorrectly cubing the 3x3 kernel dimension)"], "correct_index": 2}}, {"id": "mobile-0259", "title": "The 'Instant Reply' Metric", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the user to perceive the suggestion as 'instantaneous', what key metric must be minimized, and what is its approximate target value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT), targeting ~50ms", "Frame Rendering Latency, targeting ~16ms", "Time To First Token (TTFT), targeting ~100ms", "Throughput, targeting >20 tokens/second"], "correct_index": 2}}, {"id": "mobile-0260", "title": "The On-Device Assistant's First Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 7B INT8 assistant load its 7 GB weights over 77 GB/s LPDDR5X within a 200 ms TTFT budget?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 1}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~182 ms. Yes, but it's very close to the budget limit.", "~727 ms. No, this is far too slow and will feel laggy.", "~91 ms. 
Yes, this is well within the 200ms budget.", "~11 ms. Yes, it's extremely fast, leaving plenty of budget."], "correct_index": 2}}, {"id": "mobile-0264", "title": "The Mobile Style Transfer Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is a mobile style transfer CNN with arithmetic intensity below the A17 Pro ridge point memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; the model's Arithmetic Intensity is ~288 Ops/Byte, and since this is a high number, the compute units must be the bottleneck.", "Memory-bound; the model's Arithmetic Intensity is 0.2 Ops/Byte, which is far too low.", "Memory-bound; the model's Arithmetic Intensity is ~288 Ops/Byte, which is below the A17 Pro's hardware ridge point of ~683 Ops/Byte.", "Compute-bound; the model requires 0.005 Bytes/Op, meaning very little data is needed per operation, so the bottleneck must be compute speed."], "correct_index": 2}}, {"id": "mobile-0265", "title": "The INT8 Memory Halving", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the final memory footprint when quantizing the 50 million parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25 MB, a 4x reduction.", "12.5 MB, an 8x reduction.", "50 MB, a 2x reduction.", "100 MB, no change."], "correct_index": 2}}, {"id": "mobile-0266", "title": "The Activation Memory Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Assuming the model processes one frame at a time (batch size 1) and uses half-precision floating point (FP16), what is the memory required to store this single activation map?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 KB", "~512 KB", "~1 MB", "~10 MB"], "correct_index": 2}}, {"id": "mobile-0267", "title": "The Voice Assistant's First Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the user perceives the assistant as immediately responsive after they ask a question, which of the following latency metrics is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Total generation time for the full response", "Time to First Token (TTFT)", "NPU delegation ratio"], "correct_index": 2}}, {"id": "mobile-0269", "title": "The Dashcam Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the average power consumption and 
estimate how long the battery will last running this feature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.6 hours", "~9.2 hours", "~18.4 hours", "~23.1 hours"], "correct_index": 2}}, {"id": "mobile-0271", "title": "The Federated Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why can Federated Learning consume more total system energy than centralized training at fleet scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because a 700W datacenter GPU uses far more power than a 5W phone.", "They are roughly equal; the powerful server's work balances out the distributed work of many weak devices.", "Federated Learning, because the aggregate energy consumed by millions of phones exceeds the server's energy budget.", "Centralized, because transmitting raw data from millions of devices to the server consumes the most energy."], "correct_index": 2}}, {"id": "mobile-0274", "title": "The Depthwise Separable Advantage", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of making this change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly increases model accuracy by adding more layers.", "It only reduces model size (parameters), but compute cost (FLOPs) remains the same.", "It significantly reduces both computation (FLOPs) and parameters (model size).", "It enables models to be quantized to INT8, which is not possible with standard convolutions."], "correct_index": 2}}, {"id": "mobile-0276", "title": "The Jank Budget Fallacy", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can either the single-token or batched approach stream tokens to the UI without causing jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The batched approach is better because the throughput is higher.", "Both approaches are effectively the same since the average time per token is 30ms.", "Neither approach works; both violate the 16ms deadline, and batching increases perceived latency.", "The single-token approach works; 30ms is fast enough for a mobile device."], "correct_index": 2}}, {"id": "mobile-0279", "title": "The Federated Learning Litmus Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason to use Federated Learning for driver-drowsiness video data instead of central collection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce network bandwidth costs from uploading terabytes 
of video data.", "To achieve higher final model accuracy compared to centralized training.", "To protect user privacy by not collecting or centralizing raw video data.", "To simplify the A/B testing process for new model architectures across the fleet."], "correct_index": 2}}, {"id": "mobile-0280", "title": "The Economics of On-Device Learning: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the daily data transfer costs for centralized versus federated training across 1M mobile users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized costs $100,000/day; Federated costs $400/day.", "Centralized costs $100/day; Federated costs $4/day.", "Centralized costs $100,000/day; Federated costs $4,000/day.", "Centralized costs $102,400/day; Federated costs $4,096/day."], "correct_index": 2}}, {"id": "mobile-0284", "title": "The Mobile UI Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To maintain a smooth 60 FPS user experience and avoid UI 'jank', what is the approximate latency budget for a single inference call that needs to run on the main thread before the next frame is drawn?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 0}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1 ms", "~16 ms", "~33 ms", "~100 ms"], "correct_index": 1}}, {"id": "mobile-0285", "title": "The Real-Time Dashcam Dilemma", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many objects can your system theoretically detect in a single frame before dropping below the real-time deadline, assuming 2 Giga-Ops per object?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["17 objects", "280,000 objects", "280 objects", "1 object"], "correct_index": 2}}, {"id": "mobile-0287", "title": "The Autopilot OTA Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 60M-parameter FP16 model fit within a strict 150 MB cellular OTA payload limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the payload requires 240 MB (FP32), which exceeds the 150 MB limit.", "Yes, the payload requires 60 MB (INT8), well under the limit.", "Yes, the payload requires roughly 130 MB (FP16 + metadata), fitting the limit.", "No, parameter count alone cannot determine the binary payload size."], "correct_index": 2}}, {"id": "mobile-0288", "title": "The Drowsy Driver's Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": 
"recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following factors is the MOST critical lever for managing the total battery energy consumed by this feature over the entire shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak power draw of the inference (Pinference)", "The amount of RAM the model's activations consume", "The duty cycle of the inference process", "The speed of the phone's flash storage (UFS 4.0)"], "correct_index": 2}}, {"id": "mobile-0289", "title": "The Battery Drain Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Compare the daily battery energy cost of Model A to Model B.", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B consumes about 0.0042 Wh per day, roughly 1.5 times more than Model A's 0.0028 Wh.", "Model B consumes 3 Wh per day, while Model A consumes 0.5 Wh. It's a 6x difference.", "Model B consumes 1 Wh per day (60 Ws / 60), which is significantly more than Model A.", "Model B consumes 600 Wh per day, making it completely infeasible for a mobile device."], "correct_index": 0}}, {"id": "mobile-0290", "title": "The Mobile Roofline Dilemma: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the model's Arithmetic Intensity and determine if the workload is compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 OPS/Byte; Compute-bound", "683 OPS/Byte; Compute-bound", "140 OPS/Byte; Memory-bound", "683 OPS/Byte; Memory-bound"], "correct_index": 2}}, {"id": "mobile-0291", "title": "The INT8 Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the impact on the model's static memory footprint and calculate the new size in megabytes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 MB", "7 MB", "3.5 MB", "28 MB"], "correct_index": 1}}, {"id": "mobile-0292", "title": "The Cost of Depthwise Separable Convolutions", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a typical layer using a 3x3 kernel, approximately how much computationally cheaper is it to replace a standard convolution with a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x cheaper", "Roughly the same cost, it 
just saves memory", "About 9x cheaper", "About 128x cheaper (scales with channels)"], "correct_index": 2}}, {"id": "mobile-0294", "title": "The Real-Time Batching Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is batching real-time dashcam frames counter-productive when camera frames arrive every 33.3ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It fails. The latency for the first frame is 73.3ms because it waits for the second frame.", "It works. The average latency is 20ms (40ms/2), well under the 33.3ms deadline.", "It fails. The 40ms batch processing time exceeds the 33.3ms deadline.", "It works. Two frames give 66.6ms budget, and 40ms processing fits."], "correct_index": 0}}, {"id": "mobile-0297", "title": "The Energy Cost of Precision", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much less energy does a single INT8 arithmetic operation consume compared to a single FP32 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x", "~4x", "~18x", "~100x"], "correct_index": 2}}, {"id": "mobile-0299", "title": "The A17 Pro's Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this specific layer compute-bound or memory-bound based on its arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity of 350 Ops/Byte is much greater than the A17's ridge point of ~0.68 Ops/Byte.", "Compute-bound, because the workload's Arithmetic Intensity (350 Ops/Byte) is less than the hardware's ridge point (~684 Ops/Byte).", "Memory-bound, because the workload's Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's ridge point (~684 Ops/Byte).", "Memory-bound, because 2 GB/s is a large amount of data, which always indicates a memory bottleneck on mobile."], "correct_index": 2}}, {"id": "mobile-0300", "title": "The App Size Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the model's memory footprint in its original FP16 format, and what is its footprint after full INT8 quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600 MB (FP16) vs. 150 MB (INT8)", "300 MB (FP16) vs. 150 MB (INT8)", "2.4 GB (FP16) vs. 1.2 GB (INT8)", "150 MB (FP16) vs. 
75 MB (INT8)"], "correct_index": 1}}, {"id": "mobile-0303", "title": "The Live Caption Queueing Delay", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average queue length results when 30 FPS frames take 25ms each to process on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.0 frames", "0.75 frames", "0 frames, because service time is less than the arrival interval", "2.25 frames"], "correct_index": 3}}, {"id": "mobile-0304", "title": "The 'Always-On' Battery Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much energy does an always-on keyword feature consume over 24 hours with 100ms active at 3W each second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["72 Wh (Confuses peak power with average power)", "7.2 Wh (Ignores the idle power consumption)", "~8.3 Wh", "~29.8 Wh (Incorrectly converts Joules to Watt-hours)"], "correct_index": 2}}, {"id": "mobile-0305", "title": "The OTA Budget Bust", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory will the model parameters alone require for inference, assuming it's stored in FP16 precision?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB. (Trap: Assuming INT8 weights at 1 byte/param)", "112 GB. (Trap: Assuming 16 bytes/param for full training states)", "14 GB.", "28 GB. 
(Trap: Assuming FP32 inference at 4 bytes/param)"], "correct_index": 2}}, {"id": "mobile-0306", "title": "The Energy Cost of Data Movement", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much more energy does a DRAM access cost than one FP16 operation on a mobile ML accelerator?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1x (Trap: assuming compute and data fetch are roughly equivalent).", "~10x (Trap: assuming standard software memory hierarchy costs).", "~100x (Trap: underestimating the physical trace distance to LPDDR).", "~580x."], "correct_index": 3}}, {"id": "mobile-0307", "title": "The On-Device Battery Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percentage of a full battery charge is consumed by a single FL training round?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 1}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0%", "5.0%", "2.5%", "1.0%"], "correct_index": 1}}, {"id": "mobile-0309", "title": "The 25% Rule for Mobile Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What ML feature memory budget follows from the 25% rule on an 8 GB flagship smartphone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB", "4 GB", "2 GB", "512 MB"], "correct_index": 2}}, {"id": "mobile-0310", "title": "The Mobile Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 'buttery' 60 FPS user experience, what is the approximate latency budget for your entire ML inference pipeline per frame before a user will perceive 'jank'?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 0}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100ms (Typical cloud service P99 latency)", "~33ms (Standard 30 FPS video deadline)", "~16ms (60 FPS 'jank' deadline)", "~1ms (TinyML interrupt latency)"], "correct_index": 2}}, {"id": "mobile-0311", "title": "The On-Device Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What token throughput results when a mobile generative model's 25ms TPOT is slower than the UI frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~63 tokens/sec.
The throughput is determined by the 16.67ms UI refresh deadline.", "~24 tokens/sec. This is derived from the sum of the latencies (16.67ms + 25ms).", "40 tokens/sec. The throughput is bottlenecked by the 25ms token generation time.", "~111 tokens/sec. This is derived from the difference in latencies (25ms - 16.67ms)."], "correct_index": 2}}, {"id": "mobile-0314", "title": "The Core Premise of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason you'd state for using Federated Learning in this mobile context?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It produces more accurate models by specializing on each user's data.", "It keeps raw user data on the device, enhancing user privacy.", "It reduces network bandwidth costs by sending small gradient updates instead of large datasets.", "It allows for faster overall model training compared to centralized methods."], "correct_index": 1}}, {"id": "mobile-0315", "title": "The Federated vs. Centralized Data Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What daily centralized upload cost results from 1M users sending 10 MB each at $0.09 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$900,000 per day", "$90,000 per day", "$900 per day", "$9 per day"], "correct_index": 2}}, {"id": "mobile-0316", "title": "The AR Filter's Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 70 GOPS, 200 MB convolution layer on the Neural Engine compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's Ridge Point (~683 Ops/Byte).", "Memory-bound, because its Arithmetic Intensity is ~0.0028 Bytes/Op, which is very low.", "Compute-bound, because 35 TOPS is a very high compute capability, so the layer will always be limited by the accelerator's speed.", "Memory-bound, because its Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's Ridge Point (~683 Ops/Byte)."], "correct_index": 3}}, {"id": "mobile-0317", "title": "The App Memory Budget: Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum ML feature memory budget is reasonable on a smartphone with 8 GB of RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8.0 GB (100% Allocation)", "4.0 GB (50% Allocation)", "2.0 GB (25% Allocation)", "0.5 GB (6% Allocation)"], "correct_index": 2}}, {"id": "mobile-0319", "title": "The On-Device Reader's Bottleneck", "topic": "memory-hierarchy-design", 
"competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum TPOT is imposed by reading 1 GB of INT8 LLM weights over 77 GB/s mobile memory bandwidth?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 0}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~104 ms", "~0.04 ms", "~13 ms", "~1.3 ms"], "correct_index": 2}}, {"id": "mobile-0320", "title": "The Dashcam's Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the average power consumption of a dashcam pipeline that runs at 5W for 1s and sleeps at 100mW for 9s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.50 W", "2.55 W", "0.59 W", "4.51 W"], "correct_index": 2}}, {"id": "mobile-0321", "title": "The Cellular Data Bill Shock", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming these updates happen over cellular networks, how much data must be delivered for a 10% rollout of a 75 MB model to 1M active users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["750 GB", "7.5 TB", "75 TB", "7.5 GB"], "correct_index": 1}}, {"id": "mobile-0322", "title": "The Mobile Memory Budget", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To avoid having your app terminated by the operating system, what is the generally accepted maximum memory budget for a single application relative to the total device RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75% — The OS allocates most of the memory to the foreground app.", "50% — The memory is split evenly between the app and the OS.", "25% — A conservative budget to ensure stability alongside the OS and other processes.", "5% — The OS is highly restrictive, leaving very little for any single app."], "correct_index": 2}}, {"id": "mobile-0323", "title": "The Federated Learning Subsidy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do centralized and federated personalization data subsidies compare for 1M keyboard users at $10 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The costs are identical ($100,000/day), so the choice depends on privacy, not economics.", "Centralized costs $10,000/day and Federated costs $1,000/day; Centralized is only 10x more expensive.", "Centralized costs $100,000/day and Federated costs 
$1,000/day; Federated is 100x cheaper.", "The costs are negligible for both, totaling less than $100 per day."], "correct_index": 2}}, {"id": "mobile-0326", "title": "The UI Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 'buttery' user experience and avoid visual stutter (UI jank), what is the generally accepted maximum latency budget for the model to process a single frame?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 0}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms (Cloud P99 Latency)", "33ms (30 FPS Video Deadline)", "16ms (60 FPS UI Jank Budget)", "1ms (TinyML Interrupt Budget)"], "correct_index": 2}}, {"id": "mobile-0327", "title": "The On-Device Autocomplete Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can autocomplete keep up when each 6-token suggestion takes 450ms but requests arrive every 400ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it can keep up. The decoding time is only 250ms, which is less than the 400ms arrival time.", "Yes, it can keep up. The first token appears in 200ms, which is fast enough to feel responsive.", "No, it cannot keep up. The total service time is 450ms per request, but new requests arrive every 400ms.", "It depends on the priority of the UI thread."], "correct_index": 2}}, {"id": "mobile-0328", "title": "The Driver Drowsiness Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy (in Watt-hours) this feature consumes over an 8-hour driving shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40.0 Wh", "4.0 Wh", "4.36 Wh", "0.55 Wh"], "correct_index": 2}}, {"id": "mobile-0329", "title": "The Energy Cost of Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental physical reason why sending small, trained model updates from the device is often more energy-efficient than uploading the raw user data to a central server?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cellular data plans are expensive, so it primarily saves the user money.", "Central servers have more powerful GPUs, leading to faster training overall.", "Wireless data transmission is significantly more energy-intensive per byte than on-chip computation.", "On-device 
computation is too slow, so uploading is the only realistic option for training."], "correct_index": 2}}, {"id": "mobile-0330", "title": "The Mobile TOPS Illusion", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does a 10 GOps AR filter reading 200 MB fail to achieve the A17 Pro's advertised peak TOPS?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 1}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound, so its latency is ~0.28 ms.", "The model is memory-bound because its AI is less than the hardware's Ridge Point.", "The model is compute-bound because its AI is high.", "The model is memory-bound, so its latency is determined by compute."], "correct_index": 1}}, {"id": "mobile-0331", "title": "The Depthwise Dividend: Attention Scaling & Variants", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you would expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x", "~9x", "~28x", "~256x"], "correct_index": 1}}, {"id": "mobile-0334", "title": "The Background Battery Killer", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming a standard 4000 mAh (3.8V) battery, what percentage of the battery will this feature consume in 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3%", "~11% (Ignores 24-hour accumulated idle drain)", "~14%", "~40%"], "correct_index": 2}}, {"id": "mobile-0335", "title": "The Battery Tax of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the most critical resource consumed during on-device training that risks user churn?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 0}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network bandwidth for model updates.", "On-device flash storage for the model.", "User battery life.", "A 1.1% RAM reduction causing memory swapping."], "correct_index": 2}}, {"id": "mobile-0336", "title": "The Billion-Dollar Fleet Update", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary economic argument for using Federated Learning, and what is the data transfer cost for the centralized approach?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Proposal A is cheaper. The battery drain cost for Proposal B is over $700k because the energy calculation was not converted from minutes to hours.", "Proposal B is significantly cheaper (by over $60k), as the only cost is the user battery impact; the server cost is negligible.", "Proposal B is cheaper in the long run (saving ~$19,100/year in recurring costs), despite higher Year 1 CapEx. The cloud training cost for A is significantly higher than the combined FL server and fleet-wide battery 'churn' cost for B.", "Proposal A is cheaper by ~$58,000. The Federated Learning 'churn cost' alone is higher than the entire cloud budget."], "correct_index": 2}}, {"id": "mobile-0337", "title": "The Style Transfer Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 50 GOps layer moving 200 MB memory-bound or compute-bound on an A17 Pro (35 TOPS, 51.2 GB/s), and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the layer's Arithmetic Intensity (250 Ops/Byte) is high.", "Memory-bound, because its Arithmetic Intensity (4 Ops/Byte) is far below the hardware ridge point.", "Memory-bound, because its Arithmetic Intensity (250 Ops/Byte) is below the A17 Pro's ridge point (~683 Ops/Byte).", "Compute-bound, because 50 Giga-operations will saturate the A17's 35 TOPS compute capacity."], "correct_index": 2}}, {"id": "mobile-0338", "title": "The Mobile Memory Wall: Attention Scaling & Variants", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much memory is needed to load the FP16 weights of a 7B-parameter model for mobile inference?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "28 GB", "14 GB", "7 GB"], "correct_index": 2}}, {"id": "mobile-0339", "title": "The UI Jank Budget: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth user experience without causing UI 'jank', what is the maximum latency budget your model's inference can consume per frame on a typical 60Hz smartphone display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 2}}, {"id": "mobile-0341", "title": "The Dashcam Battery Drain Dilemma: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much energy does the model consume during a 30-minute commute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 Wh", "30 Wh", "0.5 Wh", "120 Wh"], "correct_index": 2}}, {"id": "mobile-0342", "title": "The RAG Update Bill", "topic": 
"compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How large is a monthly OTA embedding update for 500,000 locations with 768-dim FP16 vectors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~96 MB", "~384 MB", "~768 MB", "~1.54 GB"], "correct_index": 2}}, {"id": "mobile-0343", "title": "The Mobile Energy Culprit", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which part of a background mobile neural network usually dominates energy use at the hardware level?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Executing the FP16/INT8 multiply-accumulate (MAC) operations in the NPU.", "Reading sensor data from the phone's accelerometer.", "Moving model weights and activations from system DRAM to the NPU's local memory.", "Writing inference results to the app's log file on flash storage."], "correct_index": 2}}, {"id": "mobile-0345", "title": "The Mobile Roofline Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 10 GOps, 50 MB camera-filter layer memory-bound on an A17 Pro, and what roofline comparison shows it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the layer's Arithmetic Intensity is 200 Ops/Byte, which easily saturates the 35 TOPS ANE.", "Memory-bound, because the layer's Arithmetic Intensity (200 Ops/Byte) is less than the A17's Ridge Point (~683 Ops/Byte).", "Compute-bound, because 35 TOPS is a massive amount of performance, meaning compute limits execution.", "Memory-bound, because the Arithmetic Intensity is ~488 Ops/Byte."], "correct_index": 1}}, {"id": "mobile-0347", "title": "The Predictive Text Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does the model meet the 500ms latency deadline for generating exactly 5 tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["400ms. It meets the deadline with 100ms to spare.", "250ms. It meets the deadline with 250ms to spare.", "350ms. It meets the deadline with 150ms to spare.", "750ms. 
It fails to meet the deadline."], "correct_index": 2}}, {"id": "mobile-0348", "title": "The 'Magic Compose' Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total energy does the feature consume over a 16-hour active day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["48 Wh", "0.3 Wh", "4.8 Wh", "3.0 Wh"], "correct_index": 2}}, {"id": "mobile-0349", "title": "The Battery Price of Privacy", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total power draw of a modern smartphone SoC when it's actively running an ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 - 50 milliwatts", "30 - 60 Watts", "3 - 5 Watts", "700 Watts"], "correct_index": 2}}, {"id": "mobile-0353", "title": "The Smart Keyboard Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Using the hardware constants for an Apple A17 Pro, what is the maximum theoretical size (in parameters) of an INT8 model that can meet this budget for generating one token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.75 Billion parameters", "3500 Billion parameters", "1750 Billion parameters", "17.5 Trillion parameters"], "correct_index": 2}}, {"id": "mobile-0354", "title": "The Airport Parking Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total energy, in Watt-hours, will this feature consume from the car's battery over a continuous 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["96 Wh", "1152 Wh", "0.32 Wh", "19.2 Wh"], "correct_index": 2}}, {"id": "mobile-0356", "title": "The Core Benefit of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary benefit of using Federated Learning in this mobile context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It sends 1 TB of user data to a central server to be trained more efficiently on a large GPU cluster.", "It allows model training on local user data without the raw data (0 MB) ever leaving the device.", "It makes model inference faster on the mobile device by compressing the model before deployment.", "It encrypts 1 TB of user data before sending it to the server, where it is decrypted for training."], "correct_index": 1}}, {"id": "mobile-0357", "title": "The Federated Economics Trade-Off", "topic": 
"federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What monthly data volume does centralized collection generate for 100,000 users uploading 50 MB per day for 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 TB", "150 GB", "150 TB", "1.5 GB"], "correct_index": 2}}, {"id": "mobile-0358", "title": "The Mobile Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the A17 Pro ridge point, and what does it say about when a workload becomes compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.46 Bytes/Op; your model must use less than this per operation to be efficient.", "~7 TOPS/W; this is the power efficiency you can expect from the chip.", "~684 Ops/Byte; a model's Arithmetic Intensity must exceed this to be compute-bound.", "~0.68 Ops/Byte; you only need one operation per byte to be compute-bound."], "correct_index": 2}}, {"id": "mobile-0360", "title": "The Jank Budget: Batching vs. Latency", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user's request is the first to enter an empty batch, what is their effective Time to First Token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12ms", "15ms", "25ms", "27ms"], "correct_index": 3}}, {"id": "mobile-0362", "title": "The OTA Double-Storage Tax", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total storage footprint required during this transition and what is the final number?", "chain_ids": ["mobile-chain-auto-001-06"], "chain_positions": {"mobile-chain-auto-001-06": 1}, "chain_tiers": {"mobile-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 MB", "150 MB", "200 MB", "250 MB"], "correct_index": 3}}, {"id": "mobile-0363", "title": "The Federated Learning Cost Advantage", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the primary economic advantage of using Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Faster model training due to massively parallel on-device computation.", "Reduced cloud storage and network egress costs from not uploading raw user data.", "Lower initial engineering cost because Federated Learning frameworks are open source.", "Elimination of all server-side computation 
costs."], "correct_index": 1}}, {"id": "mobile-0364", "title": "The MobileNet Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of this depth-wise convolution, and why is it memory-bound?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["AI is ~25,175 Ops/Byte; it's compute-bound.", "AI is ~9 Ops/Byte; it's compute-bound because it involves millions of operations.", "AI is ~4.5 Ops/Byte; it's memory-bound.", "AI is ~9 Ops/Byte; it's memory-bound because this is far below the A17's ridge point of ~683 Ops/Byte."], "correct_index": 3}}, {"id": "mobile-0366", "title": "The Real-Time Code Completion Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 50ms TPOT phone NPU meet 15 tokens/sec for one code-completion user, and what is the maximum number of parallel sessions it can support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it fails the spec because the NPU's 50ms latency is a larger number than the 15 tokens/sec requirement.", "Yes, it meets the spec. It can support any number of sessions as long as their total requested tokens are less than 20 per second.", "Yes, it meets the single-user spec. However, only 1 session can be supported because adding a second increases the effective latency to 100ms, violating the 66.7ms deadline.", "Yes, it meets the spec. 
It can support up to 3 concurrent sessions."], "correct_index": 2}}, {"id": "mobile-0367", "title": "The Dashcam Battery Drain: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user drives for one hour, how much total energy does your feature consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.0 Wh", "0.6 Wh", "1.0 Wh", "1.0 W"], "correct_index": 2}}, {"id": "mobile-0368", "title": "The OTA Battery Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What fraction of a phone battery is used to download a 200 MB OTA model over cellular at 3W and 10 MB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.7%", "~3.15%", "~0.09%", "~0.03%"], "correct_index": 2}}, {"id": "mobile-0370", "title": "The On-Device Training Battery Tax", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy consumed over a 4-week test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3 Wh", "12 W", "12 Wh", "20 Wh"], "correct_index": 2}}, {"id": "mobile-0371", "title": "The Mobile Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the roofline ridge point represent, and what is it in operations per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. (Forgot 1000x difference)", "7 TOPS/W. (Power efficiency, not ridge point)", "~684 Ops/Byte. Workloads must exceed this Arithmetic Intensity to be compute-bound.", "~68 Ops/Byte. 
(Misplaced decimal)"], "correct_index": 2}}, {"id": "mobile-0372", "title": "The Real-Time Keyboard Assistant", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the critical user experience advantage of using continuous batching in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It maximizes overall tokens per second (TPOT) across all users.", "It reduces the model's on-chip memory footprint.", "It reduces the Time To First Token (TTFT) to avoid UI lag.", "It lowers total power consumption by requiring fewer CPU cycles for scheduling."], "correct_index": 2}}, {"id": "mobile-0373", "title": "The Unstable Queue: Mobile Latency Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the end-to-end latency for the fifth audio chunk when chunks arrive every 100ms but processing takes 120ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["120 ms", "140 ms", "200 ms", "600 ms"], "correct_index": 2}}, {"id": "mobile-0374", "title": "The Background Service Battery Killer", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user receives one message every 50 seconds on average, what is the *average* power consumption of your feature over time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20.0 mW", "40.0 mW", "39.8 mW", "1010.0 mW"], "correct_index": 2}}, {"id": "mobile-0375", "title": "The Mobile OTA Update Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak storage is needed for a fail-safe 10 MB patch update of a 100 MB mobile model?", "chain_ids": ["mobile-chain-auto-001-10"], "chain_positions": {"mobile-chain-auto-001-10": 1}, "chain_tiers": {"mobile-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 MB (Assumes a risky in-place patch with no overhead)", "110 MB (Forgets a new file is created, just adds patch to original size)", "200 MB (Forgets to include the storage for the patch file itself)", "210 MB (Correctly accounts for original, patch, and new files)"], "correct_index": 3}}, {"id": "mobile-0376", "title": "The Federated Learning Bandwidth Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do centralized and Federated Learning daily bandwidth costs compare for 10M keyboard users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: $500/day, Federated: 
$5,000/day. Federated is 10x cheaper.", "Centralized: $50/day, Federated: $500/day. Federated is 10x more expensive.", "Centralized: $5/day, Federated: $500/day. Federated is 100x more expensive.", "Centralized: $5/day, Federated: $5/day. The costs are identical."], "correct_index": 2}}, {"id": "mobile-0377", "title": "The Mobile Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 50 GOPS, 100 MB convolution layer compute-bound or memory-bound based on roofline math?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because its arithmetic intensity (500 Ops/Byte) is less than the device's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity (500 Ops/Byte) is less than the device's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity is 5,000 Ops/Byte, which is greater than the device's ridge point.", "Compute-bound, because 35 TOPS is a very high peak value, so the workload is likely limited by compute."], "correct_index": 0}}, {"id": "mobile-0383", "title": "The Federated Learning Energy Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What total daily energy does 20 seconds of 4W federated training consume across 1M phones in kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.05 kWh", "~22,222 kWh", "~22.2 kWh", "~167 kWh"], "correct_index": 2}}, {"id": "mobile-0384", "title": "The Mobile NPU Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the A17 Pro roofline ridge point for 35 TOPS compute and 51.2 GB/s memory bandwidth?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. This is the arithmetic intensity.", "~0.0014 Bytes/Op. This is the memory required per operation.", "~684 Ops/Byte. This is the minimum arithmetic intensity required to be compute-bound.", "~5468 Ops/Byte. 
This is the ridge point, calculated by converting bytes to bits."], "correct_index": 2}}, {"id": "mobile-0385", "title": "The Mobile 'Jank' Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the UI remains perfectly smooth and never 'stutters', what is the approximate latency budget your model inference must meet for a 60Hz display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 2}}, {"id": "mobile-0386", "title": "The 'Instant' Assistant's Waiting Game", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What TTFT does a FIFO queue impose when a 2B INT8 assistant has 50 memory-bound tokens left at 51.2 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 39ms, because 2 GB / 51.2 GB/s = 39ms, assuming no queuing delay.", "About 19ms, because the 35 TOPS NPU processes the 50 tokens instantly, leaving only memory overhead.", "About 39ms, assuming the new request can be immediately batched via static batching.", "Nearly 2 seconds, because the new request must wait for 50 tokens at 39ms/token (1,953 ms) to finish."], "correct_index": 3}}, {"id": "mobile-0389", "title": "The Federated Learning Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a device energy consumption perspective, which single operation do you expect to be more expensive for the end-user's battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Sending the 1KB sample, because cellular data transmission is very power-hungry.", "The local training round, because of the high number of floating-point operations (FLOPs).", "The local training round, due to the high energy cost of repeated DRAM access for model parameters.", "They consume roughly the same amount of energy."], "correct_index": 2}}, {"id": "mobile-0391", "title": "The Cellular Energy Tax", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which uses more phone battery: uploading 1 MB over 4G for cloud inference or running a 100 MFLOP NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device inference, by about 100x", "They are roughly equal in energy cost", "Sending the data, by over 10,000x", "Sending the data, but only by about 10x"], "correct_index": 2}}, {"id": "mobile-0394", "title": "The TCO Blindspot: Training vs. 
Inference", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Over a year, why can cumulative on-device inference energy dominate one-time cloud training energy?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time model training cost, as datacenter GPUs have a very high TDP.", "The cumulative inference cost on user devices, due to the massive scale of the user base.", "They are roughly equivalent; the high power of training is balanced by the scale of inference.", "The energy cost of A/B testing different models before deployment."], "correct_index": 1}}, {"id": "mobile-0395", "title": "The Squeeze-and-Excitation Question", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why are MobileNetV3 squeeze-and-excitation blocks worth 2% extra FLOPs on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0397", "title": "The Trivial Model Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a CPU beat the NPU for a single 100-neuron dense layer on a modern mobile SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0399", "title": "The NCHW vs NHWC Memory Layout", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an NCHW PyTorch model run 3x slower than an NHWC-style TFLite model on an Android CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0401", "title": "The Real-Time Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the performance bottleneck by calculating the total frame latency on the older device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is too slow. 25ms for 95 layers is the main bottleneck; the model needs to be pruned to run faster on the NPU.", "The 2ms context switch overhead is the primary issue, as it adds nearly 10% to the latency budget. The model must be re-architected to remove all unsupported ops.", "The total latency is ~42ms (25ms NPU + 15ms CPU + 2ms overhead), exceeding the 33ms budget. 
The 15ms spent on the CPU is the main bottleneck.", "A 95% delegation ratio is simply not high enough for real-time video; you must achieve at least 99% for the feature to be viable."], "correct_index": 2}}, {"id": "mobile-0404", "title": "The Fallback Tax", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an iPhone 15 Pro replace unsupported FancyReLU layers to recover 12ms ANE latency without 0.5% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a custom Metal GPU kernel for `FancyReLU` to accelerate its execution compared to the CPU.", "Apply INT8 quantization to the whole model to reduce the data size and speed up the CPU execution of `FancyReLU`.", "Replace `FancyReLU` with a similar, ANE-supported activation like `SiLU` (Swish) and retrain the model to recover the accuracy.", "Distribute the `FancyReLU` fallbacks across 10 threads to execute them concurrently, bringing total execution time under 1ms."], "correct_index": 2}}, {"id": "mobile-0408", "title": "The Battery Drain Anomaly", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the user complaints regarding data and battery drain?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 2}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 2GB memory footprint causes heavy use of slow flash storage and increasing power draw.", "The 50M parameter model update is approximately 200MB, and transmitting this large payload over a cellular network consumes excessive energy and data.", "The federated learning server is experiencing high latency, maintaining an open connection.", "50M parameters at FP16 is 50MB, causing minor drain. 
(Calculated trap for wrong precision)."], "correct_index": 1}}, {"id": "mobile-0410", "title": "The Catastrophic Accuracy Drop", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ drop an A17 Pro retail product model from 95% FP32 accuracy to 20% after office-only calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex; INT8 format lacks the precision to represent the weights accurately.", "The Apple Neural Engine does not correctly support a key operator, causing a silent fallback to the CPU with incorrect results.", "The calibration dataset was not representative of the production data, causing activation values to overflow the INT8 range during inference.", "The conversion process introduced too much noise, and the model needs to be re-trained with quantization-aware training (QAT)."], "correct_index": 2}}, {"id": "mobile-0411", "title": "The Janky AR Filter", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which mixed-precision plan gets an AR style transfer filter under 33 ms while protecting one unstable depthwise layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantize the entire model to INT8 to achieve the maximum possible speedup.", "Keep the sensitive layer in FP16 but quantize the other 49 layers to INT8.", "The NPU is the bottleneck. Offload the model to run on the mobile GPU instead.", "The model is too large. 
Prune 25% of the channels from all layers and retrain the model."], "correct_index": 1}}, {"id": "mobile-0412", "title": "The Deceiving FLOPs Reduction", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 9x FLOPs reduction from depthwise separable convolutions yield only a small mobile latency improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0413", "title": "The Transformer Tax on Mobile", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the architectural reason for the ViT's poor latency on mobile hardware, despite its comparable size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The LayerNorm operations in the ViT are not supported by the ANE, causing slow CPU fallbacks.", "The ViT has a much larger total activation memory footprint, exceeding the ANE's on-chip SRAM.", "Self-attention's non-local memory access patterns have low arithmetic intensity, making it bottlenecked by DRAM bandwidth.", "The patch embedding (stem) layer of the ViT uses a large convolution that is inefficient on mobile GPUs."], "correct_index": 2}}, {"id": "mobile-0414", "title": "The Stuttering Generative Keyboard", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What most likely causes keyboard stutter when TTFT is 80ms and TPOT is 39ms against a 16.67ms UI deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large; it should be quantized from FP16 to INT8 to reduce TPOT.", "The inference workload is not being fully delegated to the Apple A17's Neural Engine, falling back to the CPU.", "The inference pipeline is blocking the UI thread; a non-blocking, continuous batching architecture is needed.", "The NPU is overheating and thermally throttling, causing intermittent slowdowns."], "correct_index": 2}}, {"id": "mobile-0415", "title": "The Lagging AR Bounding Box", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is batching four 60 FPS AR frames worse than optimizing single-frame inference below 16.67ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's arithmetic intensity is too low, making it memory-bound and thus unsuitable for batching.", "The system must process frames in real-time. The single-frame inference time (22ms) already violates the 16.67ms deadline, and batching will only increase total latency.", "Batching is correct. 
It will improve NPU utilization and system throughput, eventually clearing the backlog of frames.", "A dynamic batching system that adapts the batch size based on queue length should be implemented."], "correct_index": 1}}, {"id": "mobile-0416", "title": "The Mobile GPU's Memory-Go-Round", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an unfused Conv2D-BatchNorm-ReLU sequence with 1 MB intermediates slow, and what optimization fixes it?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0417", "title": "The Real-time Rendering Wall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which model optimization is more likely to meet a 16ms mobile budget: unstructured pruning or dense distillation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Distillation only improves accuracy, it doesn't solve latency issues.", "Pruning halves the FLOPs, so latency will become 100ms, which is a significant improvement.", "A small, dense student model from distillation is NPU-friendly and will likely meet the 16ms budget, whereas the pruned model will see little speedup.", "Neither will work; the only solution is to run the model on the cloud via an API call."], "correct_index": 2}}, {"id": "mobile-0420", "title": "The Case of the Disappearing Pixels", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can ISP resizing from 12MP to 640x480 cause a major small-object accuracy drop despite fast NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0421", "title": "The Privacy Paradox TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What hidden FL TCO driver can dominate when daily on-device keyboard training drains about 2.2% battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 1MB daily cellular data upload for the model update is too expensive for users on limited data plans.", "The daily ~2.2% battery drain from the 5-minute on-device training is perceptible to users, causing them to disable the feature.", "The FL model updates are introducing inference latency ('jank') into the keyboard UI, leading to a poor user experience.", "The on-device training process is filling up the phone's storage with temporary files, causing 'storage full' warnings."], "correct_index": 1}}, {"id": "mobile-0422", "title": "The Mobile Style Transfer Stall", "topic": "roofline-analysis", "competency_area": 
"compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an AdaIN layer with 1 Ops/Byte far below the A17 Pro ridge point, causing it to be severely bottlenecked by memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The device is thermal throttling, forcing the ANE to run at a lower frequency due to 16.8 M-Ops density.", "The layer is compute-bound; 16.8M operations takes 2ms, implying the ANE is operating at 8.4 GOPS peak.", "The layer is memory-bound; its Arithmetic Intensity (1 Op/Byte) is far below the hardware's Ridge Point (~683 Ops/Byte).", "The ANE is running at low TOPS to improve its TOPS/W power efficiency for this simple layer, resulting in 2ms execution time."], "correct_index": 2}}, {"id": "mobile-0423", "title": "The Corrupted Video Frame", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on mobile hardware pipelines, what is the most likely cause of this post-quantization visual artifacting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The INT8 format has lower precision, causing cumulative rounding errors to build up and corrupt the final output.", "The Apple A17 Neural Engine has a hardware bug in its INT8 matrix multiplication units that is triggered by this model's architecture.", "The calibration dataset was not representative, leading to activation values at inference time overflowing the INT8 dynamic range and getting clipped by the hardware.", "The model's FP16 weights were not properly converted, and their range is too large to fit into INT8, causing them to be clipped before inference begins."], "correct_index": 2}}, {"id": "mobile-0425", "title": "The NPU Architecture Dilemma: CNN vs. 
ViT", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why might MobileNetV3 beat a smaller-parameter ViT on a mobile NPU despite the ViT having fewer weights?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MobileNetV3, because the ViT's self-attention operations are severely memory-bandwidth bound on mobile NPUs.", "The ViT, because its self-attention mechanism consists of large matrix multiplies that can saturate the NPU's compute units.", "The ViT, because it has fewer parameters, which means a smaller memory footprint and faster memory access.", "They will have identical performance, as the NPU is designed to abstract away architectural differences."], "correct_index": 0}}, {"id": "mobile-0426", "title": "The Jank Frame Catastrophe", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the video filter spike from 20 ms to 55 ms when a system notification animates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex and must be pruned or quantized further to reduce its baseline latency.", "The DSP is not powerful enough to guarantee real-time performance.", "The OS scheduler is contending for shared GPU resources between your app's inference and the system's UI rendering.", "Thermal throttling from the 20ms inference forces the SoC to downclock instantly."], "correct_index": 2}}, {"id": "mobile-0428", "title": "The Jank-Inducing Mobile Generative Model", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What does the 'work-gap-work-gap' profiler pattern indicate, and how can operator fusion fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bound: apply unstructured pruning to reduce footprint to 15ms. (Trap: 15ms compute without addressing overhead).", "The model is compute-bound: distill the architecture to save 10ms of compute. (Trap: ignores the 10ms idle gaps).", "The model is dispatch-bound due to kernel launch overhead: use operator fusion to combine sequential operations into fewer, larger kernels.", "The SoC is thermally throttling: add a 10ms cool-down period. 
(Trap: misinterprets idle gap as thermal throttling)."], "correct_index": 2}}, {"id": "mobile-0430", "title": "The Cross-GPU Choke", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause for the ~600ms of unaccounted-for latency on the server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to the storage cluster is saturated.", "Reading the 32 GB tensor from the first GPU's HBM3 memory is the bottleneck.", "The GPUs are communicating over the PCIe bus instead of a direct NVLink bridge, which is saturated by the 32 GB transfer.", "The server's CPU is overloaded with OS context switching, causing the delay."], "correct_index": 2}}, {"id": "mobile-0431", "title": "The Privacy-Aware TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does breach risk change the TCO case for Federated Learning versus centralized keystroke logging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cloud data ingress costs of $60 are far lower than the $250,000 FL engineering cost, so the cloud A/B test is cheaper.", "The on-device compute for FL will drain user batteries, leading to more churn than any potential data leak.", "The expected cost from data leak risk is $50,000 for a single experiment, making the one-time $250,000 FL investment economically rational after just 5 experiments.", "A 5% churn from a data leak is unrealistic; the actual financial risk is likely zero, so the focus should be on the lowest direct cost."], "correct_index": 2}}, {"id": "mobile-0432", "title": "The Mobile Video Filter Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is an 800 GOps, 80 MB video filter compute-bound or memory-bound, and what optimization follows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's memory-bound. The 51.2 GB/s LPDDR5 bandwidth is insufficient.", "It's power-bound. The model's TOPS/W efficiency is too low.", "It's compute-bound. Its arithmetic intensity of 10,000 Ops/Byte is significantly higher than the Ridge Point.", "It's latency-bound. 
The time to read from UFS 4.0 flash storage is the primary bottleneck."], "correct_index": 2}}, {"id": "mobile-0433", "title": "The Night Vision Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does daytime-only INT8 calibration make a sign detector miss red stop signs at night?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Nighttime pixels cause integer overflow because the 255 max bin is exceeded by aggressive ISP sensor gain.", "The Snapdragon Hexagon DSP requires asymmetric quantization for low-light scenarios, which was omitted during export.", "The INT8 scale factor of 0.0627 mapped from daytime data forces the 1.0 nighttime dynamic range into only ~16 levels, causing severe quantization noise.", "The 10,000 calibration images exhausted the limited TFLite calibration buffer, forcing a stealth fallback to INT16."], "correct_index": 2}}, {"id": "mobile-0435", "title": "The Smart Reply Latency Puzzle: NAS vs. MoE", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose why the theoretically-efficient MoE model is slower in practice on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0436", "title": "The Real-Time Driver Alert Failure", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a driver alert pipeline meet average latency but fail its 100ms safety deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75ms is the average, so it meets the deadline. 
The failure must be a memory leak.", "The camera's 33.3ms (30 FPS) arrival rate exceeds the 40ms average inference time, causing a queue overflow.", "The cumulative effect of worst-case scheduler jitter across all pipeline stages pushes the total latency beyond the 100ms budget (110ms).", "The 50ms worst-case inference leaves only 50ms for the remaining 3 stages, which average 35ms, so it should succeed."], "correct_index": 2}}, {"id": "mobile-0439", "title": "The Slow Style Transfer", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What server interconnect issue explains multi-GPU style transfer data transfers taking about 8 seconds instead of milliseconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPUs are in different server racks, and the latency comes from the InfiniBand network connection between them.", "A CUDA driver misconfiguration is forcing data to be copied to CPU host memory before being transferred to the other GPUs.", "The server lacks an NVLink bridge, forcing the GPUs to communicate over the much slower PCIe bus.", "The 250 GB data transfer is simply too large for any interconnect, and this ~4 second latency is expected even with NVLink."], "correct_index": 2}}, {"id": "mobile-0440", "title": "The Live Filter Battery Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an A17 Pro live portrait layer with 151M INT8 ops and 16.8 MB traffic battery-heavy and thermally unstable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; its AI of ~9,000 Ops/Byte exceeds the device's ridge point, meaning the NPU is the bottleneck.", "The layer is memory-bound; its AI of ~9 Ops/Byte is far below the device's ridge point of ~683 Ops/Byte, meaning the NPU is starving for data.", "The bottleneck is the INT8 precision; the A17 Pro's NPU is more efficient with FP16, and switching would resolve the thermal issue.", "The issue is weight-related cache misses; the 128-channel kernel is too large and is thrashing the L1/L2 cache during execution."], "correct_index": 1}}, {"id": "mobile-0441", "title": "The Headlight Saturation Problem", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this night-time failure, and how would you solve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile SoC's NPU lacks the computational power for this model. The model needs to be pruned or a more powerful SoC is required.", "The calibration dataset was not representative, leading to a narrow quantization range and activation overflow on high-dynamic-range night images. Re-calibrate with better data or use mixed-precision for the input layers.", "INT8 precision is fundamentally insufficient for safety-critical perception tasks. 
The entire model must be reverted to FP16, sacrificing the performance gains.", "This is a sign of a bug in the TFLite/CoreML converter's rounding implementation during quantization. You should try a different version of the conversion tool or report the issue."], "correct_index": 1}}, {"id": "mobile-0442", "title": "The Sluggish Car AI", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause for this massive discrepancy and the resulting UI jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU takes 130ms to compute the initial 100 tokens, blocking the 16ms render thread.", "The 77 GB/s memory bus restricts the prefill to 39ms, which still violates the 16ms render deadline 2 times.", "The prefill step is memory-bandwidth bound; the 195ms blocking call to read weights from DRAM stalls the UI thread.", "The Android OS scheduler is de-prioritizing the inference thread, causing context-switching delays."], "correct_index": 2}}, {"id": "mobile-0443", "title": "The Unstable Gallery Indexer", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 30 FPS video indexing queue grow infinitely when a 30ms/frame model shares the NPU with a 100ms/sec background task?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system has 10% headroom since 30ms < 33.3ms; the queue growth is likely a memory leak.", "The system processing time is 130ms, which is larger than the 33.3ms arrival rate.", "The 100ms background task increases the frame time to 130ms, missing the 33.3ms deadline.", "System utilization is 1.0; the effective service rate matches the arrival rate, causing queue instability."], "correct_index": 3}}, {"id": "mobile-0444", "title": "The AR Frame Drop", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 4K camera frame stall the GPU despite 10ms NPU inference in an AR overlay pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is taking longer than profiled, and the GPU is waiting for the detection results before it can start rendering.", "The unified LPDDR5 memory bus is saturated due to contention between the CPU, GPU, and NPU, slowing down the GPU's data fetch.", "The GPU is stalled waiting for the full 4K camera frame to transfer over the slow MIPI bus into DRAM; the peripheral bus is the bottleneck.", "A cache coherency delay between the NPU's write to memory and the GPU's read from memory is forcing a slow data synchronization."], "correct_index": 2}}, {"id": "mobile-0446", "title": "The Night-Vision Accuracy Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ keep 98.5% daytime accuracy 
but collapse below 50% on night clips with headlight activation spikes?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 0}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile NPU has buggy INT8 support. Keep the problematic layers in FP16 (mixed precision) and quantize the rest.", "The model architecture is unstable. Add more Batch Normalization layers to better regulate activation distributions before re-quantizing.", "The calibration dataset is not representative of night-time conditions, causing activation overflow. Re-run quantization with a new calibration set that includes night-driving clips.", "Per-tensor quantization is too coarse. You must switch to per-channel quantization to provide a more fine-grained scaling factor for each filter."], "correct_index": 2}}, {"id": "mobile-0447", "title": "The Style Transfer Battery Drain", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a 70 GOps, 200 MB style transfer model memory-bound on an A17 Pro, and what should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. The 70 G-OPS requires pruning because 70 / 35 TOPS = 2ms compute.", "The model is memory-bound because UFS flash storage is too slow to load weights.", "The model is memory-bound. Its AI of 350 Ops/Byte is below the ridge point (~683 Ops/Byte). 
Prioritize operator fusion.", "The model is compute-bound because the 35 TOPS NPU cannot sustain 30 FPS at 70 G-OPS."], "correct_index": 2}}, {"id": "mobile-0448", "title": "The Quantization Cliff", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can daytime-only INT8 calibration cause a driver drowsiness model to fail at night?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model architecture is too sensitive for INT8 and must be redesigned with explicit activation clipping functions like ReLU6.", "The calibration dataset was not representative of production data, causing activation saturation (overflow) during night-time inference.", "The mobile NPU has a hardware bug in its INT8 multiply-accumulate unit that causes it to produce erroneous high values.", "The per-tensor quantization scheme is flawed; switching to per-channel quantization is the only way to fix this."], "correct_index": 1}}, {"id": "mobile-0450", "title": "The Battery-Draining Vision Transformer", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the fundamental architectural reason for this significant performance regression on a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MLP blocks in the ViT are larger than the equivalent CNN layers, requiring more peak compute.", "Self-attention has poor data locality, causing a massive increase in energy-expensive random DRAM accesses.", "The Apple Neural Engine is not optimized for the LayerNorm and Softmax operations used in Transformers.", "The ViT's activation memory exceeds the L2 cache size, forcing it to use the slower main memory."], "correct_index": 1}}, {"id": "mobile-0451", "title": "The Driver Monitoring Deadline Miss", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 30 ms DMS pipeline miss a 50 ms deadline when a 50 ms gesture model runs once per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Landmark Prediction model at 15ms is too slow and must be optimized.", "The low-priority gesture model is blocking the NPU, causing DMS frames to be queued and miss their deadline.", "The hardware TOPS is insufficient for this combined workload.", "The camera's 30 FPS rate is too high, overwhelming the system."], "correct_index": 1}}, {"id": "mobile-0453", "title": "The Mystery of the Slow Avatar", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What backend communication misconfiguration can explain a 1.2s latency gap in a 4-GPU tensor-parallel avatar service?", "chain_ids": ["mobile-chain-auto-secondary-017-08"], "chain_positions": {"mobile-chain-auto-secondary-017-08": 0}, "chain_tiers": 
{"mobile-chain-auto-secondary-017-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The communication library (e.g., NCCL) is misconfigured, falling back to the kernel's TCP/IP stack instead of using a direct interconnect.", "The latency is dominated by 40GB of activations transferring over 10 Gbps Ethernet due to cross-node scheduling.", "The 4 GPUs are actually in different servers, and the 1.2s delay is the time spent transferring data over InfiniBand.", "The model's compute time is highly variable, and the 200ms figure is an average; P99 compute is likely over 1.4s."], "correct_index": 0}}, {"id": "mobile-0456", "title": "The Saturated Night-Vision Model", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can an INT8 dashcam sign detector work overall but fail completely on nighttime videos?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU drivers have a bug and don't support the specific convolution types used for processing low-light features.", "The model is suffering from catastrophic forgetting during Quantization-Aware Training (QAT).", "The calibration dataset lacked representative nighttime images, leading to incorrect quantization parameters that saturate activations for dark inputs.", "The FP16 model was already numerically unstable, and the reduced precision of INT8 caused activations to collapse to zero."], "correct_index": 2}}, {"id": "mobile-0457", "title": "The AR Jank Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which model change gets a 5M-parameter ViT AR filter under a 16 ms A17 Pro frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0459", "title": "The Mobile 'Training Cluster' Fallacy", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is a 40 Gbps USB4 phone cluster a show-stopper for 1B-parameter data-parallel fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The phone's OS (Android/iOS) lacks the necessary drivers and kernel support for RDMA or collective communication primitives.", "The sustained power draw from the NPU and CPU during training would cause the phones to overheat and thermally throttle, making performance unpredictable.", "The interconnect bandwidth of the USB4 hub is orders of magnitude too low, causing the gradient synchronization step (All-Reduce) to take longer than the computation itself.", "A 1B parameter model with Adam optimizers requires 16GB of memory, which would exceed the available RAM on most phones after accounting for the OS and other apps."], "correct_index": 2}}, {"id": "mobile-0461", "title": "The AR Filter Frame Drop", "topic": "real-time-deadlines", "competency_area": "compute", "track": 
"mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What roofline calculation diagnoses the bottleneck for a 500 GOps AR filter moving 3.4 GB per frame on an A17 Pro?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 2}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. 500 G-Ops at 30 FPS requires 15 TOPS, which exceeds the A17 Pro's 35 TOPS at 50% utilization.", "The device is thermally throttling. 3.4 GB/s exceeds the 5 Watt SoC thermal limit.", "The model is memory-bound. Its Arithmetic Intensity of ~147 Ops/Byte is far below the A17 Pro's ridge point (~683 Ops/Byte).", "The OS scheduler is preempting the inference thread because 500 GOps takes 66ms on 35 TOPS."], "correct_index": 2}}, {"id": "mobile-0462", "title": "The Headlight Blind Spot", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ preserve daytime pedestrian accuracy but fail in high-contrast night scenes with headlights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have overflowed the INT8 range because it was not trained with sufficient weight decay.", "The mobile NPU lacks an optimized kernel for the convolutions used in the early layers, causing a fallback to the much slower CPU and breaking the real-time pipeline.", "Activations in early layers are overflowing the 8-bit integer range due to high-contrast scenes, causing a catastrophic loss of precision for the majority of the feature data.", "This is the expected, unavoidable accuracy drop from INT8. 
The model must be retrained from scratch using Quantization-Aware Training (QAT) to recover the lost accuracy."], "correct_index": 2}}, {"id": "mobile-0463", "title": "The Jank-Inducing Transformer", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does replacing a MobileNetV3 segmentation backbone with a ViT cause mobile video-call jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU lacks hardware support for GELU and LayerNorm operations, forcing them onto the much slower CPU and creating a pipeline stall.", "The ViT model needs to be more aggressively quantized from FP16 to INT8, as the NPU's integer units are currently underutilized.", "The ViT's O(N^2) self-attention mechanism creates large, non-local attention matrices that saturate the SoC's memory bandwidth, stalling the NPU.", "The model's parameter count is too large, exceeding the L2 cache capacity and causing constant, slow main memory access for weights."], "correct_index": 2}}, {"id": "mobile-0465", "title": "The AI Avatar's 4-Second Stall", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What likely causes a 4-second inter-GPU stall inside an 8-GPU H100 avatar inference server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the GPUs is saturated.", "The model is too large, and the all-to-all data transfer is fundamentally limited by PCIe bandwidth.", "A software misconfiguration is forcing GPU communication over the PCIe bus instead of NVLink.", "The CPU is bottlenecking the system, preventing the GPUs from communicating efficiently."], "correct_index": 2}}, {"id": "mobile-0466", "title": "The Federated Learning Thermal Wall", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is sustained on-device federated training blocked by a mobile SoC thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network cost of uploading model updates daily from 1 million cars would be too expensive.", "The daily energy consumption from on-device training would unacceptably drain the car's main battery.", "The SoC will exceed its 5W thermal budget during the 10-minute training session, leading to severe performance throttling.", "The privacy risk of model gradients leaking sensitive facial data is fundamentally unsolved and a legal blocker."], "correct_index": 2}}, {"id": "mobile-0467", "title": "The Mobile Battery Drainer", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 70 GOps model with 350 MB of weights drain battery and leave the NPU underutilized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound because its 70 G-ops overwhelm the compute capacity.", "The model is memory-bound. The AI calculation (350MB / 70 G-ops) shows it is limited by data transfer.", "The model is memory-bound because its Arithmetic Intensity (200 Ops/Byte) is far below the ridge point (~683 Ops/Byte).", "The model is compute-bound because its Arithmetic Intensity of 200 Ops/Byte is a high number, meaning it is computationally complex."], "correct_index": 2}}, {"id": "mobile-0468", "title": "The Disappearing Cyclist", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a dashcam miss cyclists at night when headlights create activation outliers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The original FP16 model was overfit and must be retrained with more night-time data before quantization can be attempted.", "The A17 NPU has a hardware bug in its INT8 convolution kernels that is triggered by high-frequency image features.", "The calibration dataset was not representative, causing activation clipping (overflow) for the night-time cyclist images.", "The model's architecture is inherently unstable for quantization, and INT8 precision is insufficient for this computer vision task."], "correct_index": 2}}, {"id": "mobile-0481", "title": "The Mobile Jank Detective", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which NAS architecture should replace a janky mobile ViT, and why does arithmetic intensity matter?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B (MoE ViT): It has lower total FLOPs, guaranteeing a direct reduction in latency.", "Model A (CNN): Its high arithmetic intensity overcomes the memory bandwidth bottleneck, maximizing NPU utilization.", "Model B (MoE ViT): Dynamic routing avoids the 51.2 GB/s memory bandwidth limit entirely.", "Model A (CNN): It requires less than 1 TOPS, avoiding the NPU's 35 TOPS thermal throttle."], "correct_index": 1}}, {"id": "mobile-0482", "title": "The On-Device Assistant Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you solve both the high TTFT and the UI jank?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Move the audio transcription to the CPU's efficiency cores and dedicate the NPU entirely to the LLM generation.", "Re-quantize the model to INT4 and apply pruning to reduce the weight size by 50%.", "Implement chunked prompt ingestion and prioritize decode steps over pre-fill steps in the NPU command queue.", "Increase the static batch size to process more user input at once, improving NPU utilization."], 
"correct_index": 2}}, {"id": "mobile-0483", "title": "The Data Center Mindset on Mobile", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can't we build a custom 64 GB/s PCIe-like interface for the phone to eliminate this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bottleneck is the mobile link's low bandwidth. We should use a newer, faster standard like USB4 v2 (80 Gbps) to get closer to PCIe speeds.", "The latency comes from serialization/deserialization overhead in the software stack. We should switch to a zero-copy protocol like FlatBuffers.", "The proposal is non-viable due to the prohibitive power consumption of a PCIe-like interface, which would exceed the phone's entire thermal and power budget. The correct approach is on-device model optimization.", "The external accelerator is too slow. The latency is an acceptable trade-off for higher quality, and the battery drain can be solved by asking the user to plug in their phone."], "correct_index": 2}}, {"id": "mobile-0486", "title": "The Night-Vision Overflow", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose and solve this catastrophic performance drop under night-time conditions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The calibration dataset is not representative of night-driving conditions, causing activation saturation. Re-quantize with a more diverse dataset including night-time samples.", "Post-training quantization is too aggressive for this architecture. The only reliable solution is to implement full Quantization-Aware Training (QAT) to retrain the model from scratch.", "The NPU likely has a hardware bug related to sparse feature maps from IR images. The problematic layers should be forced to run on the more reliable CPU.", "The model's architecture is fundamentally unstable for 8-bit precision. It must be redesigned with more normalization layers."], "correct_index": 0}}, {"id": "mobile-0487", "title": "The Mobile UI Jank Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 20M-parameter ViT cause 25ms mobile UI jank, and what architecture class should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 20M parameters (40MB in FP16) are saturating the A17's 51.2 GB/s memory bandwidth. Quantizing weights to INT8 would halve the memory pressure and fix the jank.", "The A17's Neural Engine likely has poor support for the specific self-attention operations, causing the model to fall back to the much slower CPU.", "The Vision Transformer's self-attention produces large activation tensors that saturate memory bandwidth. The model is memory-bound (~4.6 GFLOPs but 25ms latency), and should be replaced with a CNN architecture that has better data locality.", "The model is too dense. 
Applying Mixture-of-Experts (MoE) routing would reduce the active parameter count per frame and thus the latency."], "correct_index": 2}}, {"id": "mobile-0488", "title": "The On-Device AI Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What implementation bug causes generated tokens to slow down over a sequence despite an initially fast first word?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is insufficient for a 1B parameter model, causing a compute bottleneck.", "The LPDDR5 memory bus is saturated from loading model weights for each token, causing a memory bandwidth bottleneck.", "The generation loop is performing a stateless re-computation of the entire sequence for each token due to a missing or misused KV cache.", "The mobile OS is thermal throttling the NPU, causing performance to degrade as generation continues."], "correct_index": 2}}, {"id": "mobile-0489", "title": "The Interconnect Blind Spot", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a cloud-trained tensor-parallel 3B model be a poor fit for mobile deployment and quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model architecture relies on Tensor Parallelism over NVLink, which is absent in the mobile SoC's unified memory architecture.", "The training cluster's InfiniBand network allowed for faster data loading, and the mobile device's slow UFS flash storage is causing a data input bottleneck.", "The Apple A17 Pro's 35 TOPS NPU is simply not enough compute power for a 3B parameter model that was trained on 989 TFLOPS H100s.", "The model requires PCIe Gen5 to feed the accelerator, and the mobile SoC's internal bus protocol has much higher latency, starving the NPU."], "correct_index": 0}}, {"id": "mobile-0490", "title": "The Drowsy Driver Dilemma: Centralized vs. 
Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which strategy yields a lower Total Cost of Ownership (TCO) for a 100K-vehicle driver drowsiness fleet: centralized collection or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper: the $5,000 training cost is high but FL's daily uploads add up to more.", "The costs are roughly equivalent: as FL's high frequency of uploads cancels out its smaller data size.", "Federated Learning is cheaper by over $75,000 per year.", "Centralized is cheaper because the weekly cloud training cost of $5,000 dominates all other factors."], "correct_index": 2}}, {"id": "mobile-0493", "title": "The Mobile ViT Deadline Miss", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can two 10 GFLOP bokeh models have radically different latency on an A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT's non-local memory access pattern for the attention map results in a high number of cache misses on the SoC, which is the primary bottleneck. The FLOP count is secondary.", "The Apple A17's Neural Engine lacks hardware acceleration for the specific Softmax or LayerNorm ops in the ViT, forcing them onto the CPU.", "The ViT's self-attention has a low Arithmetic Intensity, making it memory-bandwidth bound on the A17's memory system.", "The ViT's larger activation sizes are causing cache eviction and thrashing, but the issue is cache capacity, not bandwidth."], "correct_index": 2}}, {"id": "mobile-0494", "title": "The Stuttering AI Assistant: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the missed deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The A17 Pro's compute is insufficient for a 3B model, making the operation compute-bound.", "Continuous inference causes thermal throttling, increasing TTFT to > 60 ms.", "The 8ms static batching queue and the ~44ms memory-bound weight load collectively exceed the 16ms budget.", "The phone's CPU dispatch latency adds a 50 ms overhead before the Neural Engine starts."], "correct_index": 2}}, {"id": "mobile-0496", "title": "The Federated Photobomber Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What cost makes the proposed Federated Learning training plan impractical despite privacy benefits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network bandwidth required to upload model updates from 10 million clients across 1,000 rounds is the primary cost bottleneck.", "The 
cumulative on-device energy consumption will lead to unacceptable battery drain for users.", "The cloud compute cost for the central server to aggregate gradients from millions of clients will be the most expensive part of the system.", "The model will likely have poor final accuracy due to non-IID data from users, making the effort technically infeasible regardless of cost."], "correct_index": 1}}, {"id": "mobile-0499", "title": "The Mobile Transformer Jank Puzzle", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an A17 Pro ViT video filter hit only 15 FPS with 30% ANE utilization but saturated memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too small for the NPU, and the batch size should be increased to improve utilization.", "The NPU driver has a bug causing inefficient scheduling of Transformer operations, and it should be reported to the vendor.", "The ViT's self-attention is memory-bound; its arithmetic intensity is too low for the A17's hardware balance.", "The model must be quantized from FP16 to INT8, as the computational load is clearly too high."], "correct_index": 2}}, {"id": "mobile-0500", "title": "The SoC Shuffle Tax", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the SoC shuffle tax, and how does CPU-NPU ping-ponging an 8 MB tensor hurt latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0503", "title": "The Transformer's Mobile Traffic Jam", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a ViT object detector with a similar parameter count run far slower than MobileNetV2 on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has a larger total activation memory size, which overflows the NPU's limited on-chip SRAM.", "The Vision Transformer has significantly more total FLOPs (Floating Point Operations) than the MobileNetV2.", "The ViT's self-attention has low Arithmetic Intensity, making it memory-bandwidth bound on the NPU.", "The A17 Neural Engine lacks optimized hardware kernels for the Softmax and LayerNorm operations within the ViT."], "correct_index": 2}}, {"id": "mobile-0504", "title": "The Sluggish Smart Reply: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching make smart reply TTFT exceed 300 ms even though TPOT is 80 ms?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 2}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["The 2B parameter model is too large for the mobile NPU, causing slow inference.", "The static batching policy is introducing excessive queuing delay before inference begins.", "LPDDR5 memory bandwidth is saturated, creating a bottleneck when loading model weights for each batch.", "The CPU is too slow at tokenizing and preparing the input tensors, starving the NPU."], "correct_index": 1}}, {"id": "mobile-0505", "title": "The Mobile App's Cloud Latency Mystery", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What intra-server GPU interconnect bottleneck most likely explains 85ms sync time in a 2-GPU H100 inference service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the servers in the rack is saturated.", "The model is too large, causing the compute phases on the H100s to be the bottleneck.", "The two GPUs are communicating over the PCIe bus instead of NVLink, likely due to a server misconfiguration.", "The GPU HBM is full, causing the system to swap activations to slower system DRAM."], "correct_index": 2}}, {"id": "mobile-0506", "title": "The Privacy vs. Price Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which first-year Total Cost of Ownership (TCO) is lower for mobile keyboard autocorrect: centralized training or Federated Learning?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 2}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper ($75k vs $236.5k) due to high FL CapEx.", "FL is cheaper ($36.5k vs $50k) due to lower cloud compute.", "FL is cheaper ($36.5k vs $75k) due to no data storage costs.", "Centralized is cheaper ($50k vs $200k) due to server costs."], "correct_index": 0}}, {"id": "mobile-0510", "title": "The Stuttering On-Device Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling architecture should an on-device voice assistant use to reduce queueing delay during generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive 4-bit quantization to the model to reduce TPOT below the 16ms deadline.", "Redesign the UI to be fully asynchronous, only displaying the full text once the entire sequence is generated in the background.", "Implement continuous batching to process new user input at the token-level alongside existing generation, minimizing queueing delay.", "Conclude the NPU TOPS are insufficient and offload inference to the cloud."], "correct_index": 2}}, {"id": "mobile-0511", "title": "The Distributed Training Scaling Failure", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What 
cross-node interconnect bottleneck explains poor scaling when teacher-model training grows from 2 to 8 nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The latency is 8 seconds per step, indicating the PCIe Gen5 bus connecting the GPUs to the CPU is saturated.", "The throughput is 1.5x, meaning NVLink 4.0 bandwidth between GPUs within each node is the bottleneck.", "The latency is 8 seconds per step due to using standard Ethernet instead of an RDMA-capable interconnect like InfiniBand.", "The model's low arithmetic intensity limits the speedup to 1.5x, requiring larger batch sizes."], "correct_index": 2}}, {"id": "mobile-0512", "title": "The Voice Assistant A/B Test Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What strategy avoids astronomical cellular upload costs while preserving privacy for a 1M-vehicle voice assistant A/B test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0514", "title": "The Saturated AR Filter", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a MobileNetV3 AR filter glitch only in bright outdoor scenes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The post-training quantization (PTQ) calibration dataset was not representative of production data, causing activation values to overflow the INT8 range in bright scenes.", "The Neural Engine has a documented hardware bug when performing INT8 convolutions.", "MobileNetV3's architecture uses swish activations which are numerically unstable in INT8.", "The model was quantized using dynamic quantization, which is too slow."], "correct_index": 0}}, {"id": "mobile-0515", "title": "The Infotainment Jank", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which architecture should replace a janky ViT on an infotainment system, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is memory-bandwidth-bound due to attention's low arithmetic intensity. Choose the MobileNetV3, as its convolutional structure provides better data locality.", "The ViT is compute-bound because 22M parameters is too large. Choose the ViT-MoE.", "The problem is model size. 
Choose the ViT-MoE as it has fewer active parameters.", "The bottleneck is likely an inefficient operator in the ViT model that isn't supported."], "correct_index": 0}}, {"id": "mobile-0516", "title": "The Laggy AI Assistant", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 100ms static batching timeout make a mobile AI suggestion queue unstable for fast typing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 150ms TTFT is too slow for a real-time mobile application and must be optimized to be less than the 100ms request arrival interval.", "The max batch size of 4 is too large, causing high latency. Reducing it to 1 would solve the problem.", "The 100ms static batching timeout means the system's service time (250ms) exceeds the request arrival interval (100ms), causing the request queue to become unstable.", "The latency is caused by the 100ms batching interval compounding with 150ms TTFT, leading to exactly 1.5 seconds of queue wait time after 6 keystrokes."], "correct_index": 2}}, {"id": "mobile-0517", "title": "The Federated Economics Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 10M smart-reply DAU, how does centralized logging TCO compare with an $800K federated-learning buildout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Option A (Centralized) is better because the $800,000 upfront engineering cost for FL makes it too expensive.", "Option A (Centralized) has a much lower infrastructure TCO (~$4.2k vs ~$327k/year), but Option B (Federated) may be the true cheaper option when regulatory and breach risks are factored in.", "Option A (Centralized) is better; the daily costs are a standard operational expense and the privacy risk can be mitigated with user agreements.", "Option B (Federated) is cheaper in pure infrastructure costs, even when amortizing the engineering cost."], "correct_index": 1}}, {"id": "mobile-0520", "title": "The Overexposed Image Crash", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 calibration on lab images make a smart-exposure model fail outdoors in bright sun?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have a dynamic range too large for INT8, causing weight overflow during conversion.", "The NPU has a known hardware bug when handling certain INT8 convolution operations.", "The calibration dataset was not representative of outdoor scenes, leading to activation values overflowing the INT8 range.", "The increased temperature of the phone from being in the sun caused thermal throttling."], "correct_index": 2}}, {"id": "mobile-0522", "title": "The Sluggish Co-Pilot", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "What inference phase explains an 800ms TTFT with 50ms TPOT in an on-device 3B LLM assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large, making the NPU completely memory-bound for both phases.", "The model is being loaded from slow flash storage on every request, causing high cold-start latency.", "The initial prompt processing (pre-fill) is highly compute-bound due to the large GEMM operation required for the entire prompt sequence.", "The CPU is taking too long to tokenize the user's text input before sending it to the Neural Engine."], "correct_index": 2}}, {"id": "mobile-0523", "title": "The Multi-Node Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does scaling from 8 to 32 H100s improve throughput only 20% when AllReduce consumes 70% of each step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring gradients between the GPU and the network card for inter-node communication.", "The model's activations are causing HBM cache misses, and the system is stalling on main memory access.", "Communication is bottlenecked by the ~18x bandwidth drop when moving from the 900 GB/s intra-node NVLink to the ~50 GB/s inter-node InfiniBand fabric.", "The CPU is unable to schedule the `AllReduce` kernels fast enough across the 32 GPUs, creating a dispatch bottleneck."], "correct_index": 2}}, {"id": "mobile-0524", "title": "The Real-Time Filter Lag", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the custom real-time filter layer memory-bound or compute-bound, and what arithmetic intensity evidence supports that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound because 7 GOps takes 200ms on a 35 TOPS chip.", "The layer is memory-bound because its Arithmetic Intensity (140 Ops/Byte) is less than the A17 Pro's ridge point (~683 Ops/Byte).", "The layer is compute-bound because its Arithmetic Intensity is 0.14 Ops/Byte (7/50), below the ridge point.", "The layer is memory-bound because 50 MB takes 50ms at 1 GB/s."], "correct_index": 1}}, {"id": "mobile-0526", "title": "The Keyboard Jank Crisis", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What architectural path best fixes a 50M Transformer keyboard model that misses latency and memory budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bandwidth bound. Switch to a MobileNet-style CNN using depthwise separable convolutions to create larger, more parallelizable operations.", "The model is too large. 
Shrink the Transformer to 25M parameters to cut the memory and compute costs by 50%, which will meet the latency target.", "The NPU is being starved. Use a Neural Architecture Search (NAS) to automatically find a more efficient Transformer block.", "The model is compute-bound. Apply 4-bit quantization to the existing model to increase throughput and meet the latency target."], "correct_index": 3}}, {"id": "mobile-0527", "title": "The On-Device Copilot's Janky Keyboard", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching four 20-token keyboard predictions on A17 Pro cause 16ms UI deadline misses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is too slow. With a 25% utilization rate, it is clear the hardware cannot keep up with the 10 request/sec arrival rate, causing the request queue to grow indefinitely and block the UI thread.", "The memory bandwidth is the bottleneck. The model is too large, and the NPU is spending most of its time waiting for data from DRAM instead of computing, which is why the processing time is so long.", "The static batching creates head-of-line blocking. The 100ms uninterruptible batch processing time exceeds the 16ms UI deadline, causing frame drops. A switch to continuous batching is needed.", "The batch size is too large. Reducing the batch size to 1 is the only way to minimize latency and ensure the 16ms deadline is never missed, even if it means lower overall throughput."], "correct_index": 2}}, {"id": "mobile-0528", "title": "The Privacy vs. 
Profit A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What net daily value do centralized 95% and federated 80% smart-reply models create for 1M users at 10 suggestions/day?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cloud model is superior by $17,500 per day, as it generates far more value from accepted suggestions.", "The Federated Learning model is the only responsible choice, as the potential cost of a data breach outweighs any daily revenue metric.", "The Cloud model is only marginally better, with a net value gain of $14,000 per day.", "The Federated Learning approach has a daily opportunity cost of $9,000, forcing a decision between immediate user value and long-term privacy strategy."], "correct_index": 3}}, {"id": "mobile-0529", "title": "The Cross-Country Trip on a City Bus", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the cloud LLM server showing 500ms P99 latency despite fast network RTT and high GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server is compute-bound because GPU utilization is at 90%, and a more powerful GPU like the B200 is needed.", "The HBM3 memory latency is too high (~300 ns), creating a memory wall that stalls the GPU cores.", "The system is communication-bound due to using the low-bandwidth PCIe bus for tensor parallel exchanges instead of the high-bandwidth NVLink fabric.", "The data center's InfiniBand network is saturated, as the 8 GPUs are likely in different racks, causing high latency between them."], "correct_index": 2}}, {"id": "mobile-0532", "title": "The Mobile-ViT Latency Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Micro-ViT disproportionately slower than MobileNetV3 on A17 Pro, and what architecture best balances accuracy and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is compute-bound because 1.5 GFLOPs / 30ms = 50 GFLOPS, exceeding the hardware's sustained rate.", "The ViT model is too large for the cache. We must apply more aggressive INT4 quantization and 50% unstructured pruning to reduce the memory footprint.", "The ViT is memory-bandwidth bound due to the low arithmetic intensity of self-attention. Propose a hybrid CNN-Transformer architecture, potentially found via NAS, that uses efficient depthwise separable convolutions for early stages and attention for later stages.", "Increase model capacity by using a Mixture of Experts (MoE) layer, which keeps inference FLOPs constant by only activating one expert. 
This will improve accuracy without increasing latency."], "correct_index": 2}}, {"id": "mobile-0533", "title": "The Janky Visual Search", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given that the mobile network latency is consistently around 50ms and the model's compute-per-expert is stable, what is the most likely cause of this high latency variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The backend's InfiniBand network, used for RDMA, is dropping packets, causing high-latency fallbacks.", "The model's KV-cache is spilling from HBM to much slower system RAM, but only for certain inputs that activate the largest experts.", "The server lacks full NVLink connectivity, and the MoE router is picking experts on GPUs across different CPU sockets, forcing slow communication over the PCIe bus and inter-socket links.", "The mobile client's cellular connection has high packet loss, and TCP retransmission delays are causing the latency spikes."], "correct_index": 2}}, {"id": "mobile-0534", "title": "The Mobile Style Transfer Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 200 GOps style transfer layer compute-bound or memory-bound, and what roofline evidence supports that diagnosis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The 35 TOPS of the NPU is insufficient to execute 200 Giga-ops within the 16ms time budget.", "Memory-bound. The layer's Arithmetic Intensity (400 Ops/Byte) is below the hardware ridge point (~683 Ops/Byte).", "Thermally-bound. The SoC is likely overheating and throttling the NPU clock speed, a common issue in slim devices under sustained load.", "Memory-bound. 
The 500 MB of data is too large to fit in the on-chip caches, forcing slow reads from main RAM."], "correct_index": 1}}, {"id": "mobile-0536", "title": "The Laggy Mobile Video Filter", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What backend topology problem most likely explains the 150ms P99 latency for the cloud-based mobile video filter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile client's 5G connection has high jitter, causing unpredictable packet delays.", "The data transfer from the server's network card to the GPU over the PCIe bus is the primary bottleneck.", "The backend orchestrator is placing the two model GPUs on different servers, forcing traffic over the slower InfiniBand network instead of NVLink.", "The H100 GPU is running too hot under load and is being thermally throttled, increasing inference time."], "correct_index": 2}}, {"id": "mobile-0537", "title": "The Mobile Video Battery Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 3x3 depthwise convolution on a 112x112x256 INT8 tensor drain battery on A17 Pro?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; the 3x3 kernel requires too many FLOPs for the NPU to sustain 60 FPS.", "The layer is memory-bound; its arithmetic intensity (9 Ops/Byte) is far below the A17's ridge point, stalling the NPU.", "Thermal throttling is artificially reducing the ridge point to 10 Ops/Byte, matching the layer's intensity.", "The INT8 data type induces a 4x penalty on memory bus transfers compared to native FP16 execution."], "correct_index": 1}}, {"id": "mobile-0538", "title": "The Headlight False Positive", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the root cause of the catastrophic INT8 accuracy drop during night driving with bright headlights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SoC's NPU has a hardware bug when processing high-frequency image data at night.", "The CNN architecture has overfit to daytime images and is not robust to the domain shift of night driving.", "The INT8 calibration range is too narrow, causing activation value saturation when encountering high-contrast night scenes.", "The camera's auto-exposure control is failing at night, delivering incorrectly normalized input frames to the model."], "correct_index": 2}}, {"id": "mobile-0539", "title": "The On-Device Search Dilemma", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the ViT 
miss the 16ms mobile jank budget compared with the NAS-proposed depthwise CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT's non-local memory access pattern for the attention map results in a high number of cache misses on the SoC, which is the primary bottleneck. The FLOP count is secondary.", "The Apple A17's Neural Engine lacks hardware acceleration for the specific Softmax or LayerNorm ops in the ViT, forcing CPU fallback.", "The ViT's computational cost scales quadratically with input tokens, making it ~7.4x more expensive (~67 MFLOPs vs ~9 MFLOPs). This FLOP gap is the root cause.", "The model is bottlenecked by memory bandwidth, not compute. The fix is to quantize from FP16 to INT8, halving data movement and solving latency without changing architecture."], "correct_index": 0}}, {"id": "mobile-0540", "title": "The Live Translation Freeze", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What causes the live translation feature's excessive TTFT when static batching waits for four pending requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's single-token inference time of 40ms is too slow for the 80ms budget and requires further optimization.", "The Snapdragon 8 Gen 3 NPU is saturated by a batch of 4, causing thermal throttling and increased latency.", "The static batching policy creates excessive queueing delay, as early requests are starved waiting for the batch to fill.", "The device's LPDDR5x memory bandwidth is insufficient for batching, causing stalls when moving four requests' data to the NPU."], "correct_index": 2}}, {"id": "mobile-0541", "title": "The Federated Learning TCO Trap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What drives the much higher network TCO of the federated keyboard personalization variant, and how should the trade-off be framed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cost is high because 5% of 5 million users is too large; the participation rate should be lowered to 0.1% to make it cost-effective.", "The FL approach transfers 50,000x more data per day (5 TB vs. 100 MB). This higher TCO is a direct trade-off for enhanced user privacy and reduced data breach liability, as no raw text ever leaves the device.", "The real cost isn't the network, but the on-device compute draining user batteries. The network cost is a secondary concern that can be ignored.", "The centralized approach is cheaper and therefore better. Properly anonymized text is 'good enough' for privacy, and the massive cost savings are the most important business factor."], "correct_index": 1}}, {"id": "mobile-0542", "title": "The Mobile App's Cloud Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the AI Retouch backend spend 250ms moving only 50MB over PCIe Gen5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated and cannot handle the 50MB transfer load efficiently.", "Model execution (400ms) is the primary bottleneck; the transfer time is secondary and should be ignored.", "The data is being moved in thousands of small chunks, making the transfer latency-bound.", "The server's InfiniBand network is causing contention on the PCIe bus, slowing down the transfer."], "correct_index": 2}}, {"id": "mobile-0543", "title": "The Real-Time Filter Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, what bottleneck explains the real-time video filter's 24ms frame latency on an A17-class device?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 2}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. An arithmetic intensity of 58.3 Ops/Byte is very high, indicating a workload dominated by calculations.", "The model is compute-bound. The A17 Pro's 35 TOPS is not sufficient to process 70 G-Ops within the 16ms frame budget.", "The model is memory-bound. Its arithmetic intensity of 58.3 Ops/Byte is significantly lower than the A17 Pro's ridge point of ~700 Ops/Byte.", "The bottleneck is thermal throttling. The workload is exceeding the SoC's power budget, forcing it to slow down, regardless of the model's specifics."], "correct_index": 2}}, {"id": "mobile-0544", "title": "The Night-Vision Blind Spot", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a pedestrian detector miss night scenes when daytime calibration has only 100 images?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MobileNet architecture's Hard-Swish activations are unstable for quantization and are causing numerical errors.", "The phone's NPU has a hardware bug in its INT8 arithmetic unit that incorrectly handles high-value multiplications.", "The calibration dataset lacked representative nighttime images, leading to incorrect quantization parameters that saturate activations for dark inputs.", "The model is overfitting to the training data and needs to be completely retrained with a lower learning rate and more regularization."], "correct_index": 2}}, {"id": "mobile-0545", "title": "The MobileNet Migration Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which architectural change should replace the slow ResNet-style layer to meet the mobile visual sticker budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0546", "title": "The AI Keyboard Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching cause the AI keyboard to freeze despite acceptable single-request TTFT and TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The latency equals (80ms + 50ms) * 4 = 520ms, proving the NPU is saturated and batch size must decrease.", "The 80ms TTFT is inherently too slow for real-time UI, causing the jank.", "Static batching forces short requests to wait ~530ms for the longest sequence to finish via head-of-line blocking.", "The average latency of 330ms indicates the shared LPDDR5 memory bus is saturated."], "correct_index": 2}}, {"id": "mobile-0547", "title": "The Topologically Flawed Upgrade", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is replacing InfiniBand with NVLink a flawed fix for multi-node AllReduce bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand NDR (400 Gbps) provides higher sustained bandwidth across nodes than NVLink 4.0 (900 GB/s) due to overhead.", "It is better to connect the nodes directly using PCIe Gen5 for lower latency.", "NVLink is an intra-node interconnect for GPU-to-GPU communication within a server, while InfiniBand is for inter-node communication.", "The bottleneck is a software issue in the AllReduce algorithm's implementation, not a hardware limitation."], "correct_index": 2}}, {"id": "mobile-0550", "title": "The Sluggish Voice Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the in-car voice assistant have slow TTFT even though subsequent tokens generate quickly?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 2}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The prompt prefill computation is too large, exceeding the NPU's per-operation compute budget and causing a hardware fault.", "The system is running out of on-chip memory when both tasks are loaded, forcing slow data swaps from main DRAM during prefill.", "The FIFO queue is causing head-of-line blocking; the high-priority voice query is stuck waiting for the entire low-priority background task to complete.", "The fast TPOT generation for the previous task is saturating the memory bus, preventing the voice prompt from being loaded onto the NPU."], "correct_index": 2}}, {"id": "mobile-0551", "title": "The Mobile App's Server-Side Stall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What server-side data ingress bottleneck explains the AI Photo Editor's missing 105ms of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile device's 5G uplink has 105ms of high packet loss, forcing numerous retries.", "Data transfer over the server's PCIe bus from CPU RAM to GPU HBM adds ~105ms due to software-mediated copies.", "The H100 server uses InfiniBand to connect to storage, adding 105ms of latency.", "The server's NVLink interconnects are saturated, adding 105ms to the data copy."], "correct_index": 1}}, {"id": "mobile-0554", "title": "The Mobile NAS Showdown: CNN vs. Sparse Transformer", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate should be chosen for the NPU video filter, and why does hardware efficiency outweigh raw operation count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Candidate B, because its total active operations (47.5 GOps) are significantly lower than the original 100 GOps ViT.", "Candidate B, because Transformers are architecturally superior to CNNs and MoE makes them efficient enough for mobile deployment.", "Candidate A, because its regular, dense structure achieves much higher architectural efficiency (eta_arch) on the mobile NPU, resulting in a latency of ~1.1ms vs ~4.5ms for the MoE model.", "Candidate A, because its total GOps (30) are lower than the MoE ViT's (47.5), and lower GOps always means lower latency."], "correct_index": 2}}, {"id": "mobile-0555", "title": "The Stuttering Translator", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batch size 4 cause a 500 ms first-token delay for a 1B translator on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 1B model size causes a 500ms memory bottleneck because 1GB / 2 GB/s effective bandwidth = 500ms.", "The 35 TOPS NPU takes 500ms to process the 4-token prefill batch.", "The static batch size of 4 forces a ~500ms queuing delay while waiting for user input. Switch to continuous batching.", "The 51.2 GB/s bandwidth limits prefill of 4 batched tokens to ~500ms."], "correct_index": 2}}, {"id": "mobile-0556", "title": "The Lane Centering TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which retraining strategy is economically viable for the fleet, and what data-transfer cost drives the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Training, because the annual cost is around $66,000, which is a reasonable R&D expense.", "Federated Learning, because the Centralized approach would cost over $1.2 million annually in data fees.", "Centralized Training, as the powerful cloud GPUs result in a better model, and data costs are negligible.", "Federated Learning, to avoid user complaints about battery drain."], "correct_index": 1}}, {"id": "mobile-0557", "title": "The Laggy Generative Uncrop", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What inter-GPU communication bottleneck explains the 1.5s latency of the Generative Uncrop backend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0559", "title": "The Saturated Stop Sign", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this catastrophic accuracy drop for a single class, and how would you solve it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Snapdragon NPU has a known bug with certain convolutional operators when handling saturated INT8 values. You should force this specific layer to run on the CPU or GPU instead of the NPU.", "The model requires higher precision for this feature. You should use mixed precision, keeping the final convolutional and classification layers in FP16 while quantizing the rest to INT8.", "The weights in the layer have a high dynamic range, not the activations. You should apply per-channel quantization to the weights of the final convolutional layer instead of per-tensor.", "The high activation values from stop signs are creating a large dynamic range, crushing the quantization resolution for all other classes. You should use a percentile-based clipping calibration method to ignore these rare outliers."], "correct_index": 3}}, {"id": "mobile-0560", "title": "The Mobile MoE Trade-off", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which option demonstrates the most sophisticated understanding of on-device constraints and is the most promising to investigate further?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Micro-ViT, because Transformers are the most powerful modern architecture and worth the integration cost.", "The EfficientNet-B0, because it's a standard architecture that provides a performance boost and comfortably fits the latency budget.", "The MoE Hybrid, because it uniquely decouples model capacity (total parameters) from inference cost (active FLOPs), fitting the task and budget.", "Both the EfficientNet-B0 and the MoE Hybrid are equally valid choices since both are well under the 25ms latency budget."], "correct_index": 2}}, {"id": "mobile-0561", "title": "The Stuttering Voice Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching make the on-device voice assistant emit words in bursts instead of a smooth stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU's 10ms-per-token performance is too slow; the model must be quantized from FP16 to INT8 to increase raw token throughput.", "The LPDDR5 memory bandwidth is saturated by loading activations for the whole batch. The model's hidden dimension size must be reduced.", "The static batching forces each user to wait for the entire batch's tokens to be generated for every word, causing high TPOT. Switch to continuous batching to decouple requests and stream tokens back smoothly.", "The 50ms batching timeout is too high, causing unacceptable TTFT. The timeout should be reduced to 10ms to make the system more responsive."], "correct_index": 2}}, {"id": "mobile-0563", "title": "The Predictive Charging TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which predictive charging training approach is cheaper, and how does participation rate change the TCO calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper by ~$131,400 because FL uploads 5MB per user instead of 1MB.", "Federated Learning is more expensive by ~$147,825 assuming 100% participation for 5MB uploads.", "Federated Learning is cheaper by ~$16,425 because the 10% participation rate offsets the 5x larger payload.", "Both approaches cost exactly $32,850 because bandwidth compresses model updates."], "correct_index": 2}}, {"id": "mobile-0564", "title": "The Keyboard's Privacy-Cost Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which smart keyboard personalization approach should the A/B test use, considering privacy, battery, and data TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose Centralized (A). The battery drain from on-device training is unacceptable and will cause user churn.", "Choose Centralized (A). The annual data upload cost of $24,000 is high, but still cheaper than the data costs from federated learning updates.", "Choose Federated (B). It has a lower annual data TCO (~$3.6K vs ~$24K) and critically, avoids uploading sensitive user keystrokes, making it the only viable option from a privacy standpoint.", "Choose Federated (B), because its annual data TCO is only $3,600, which is an order of magnitude cheaper than the Centralized approach's multi-million dollar data bill."], "correct_index": 2}}, {"id": "mobile-0565", "title": "The Smart Keyboard TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which smart keyboard training strategy has the lower full TCO after accounting for network and user battery costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0566", "title": "The Mobile 'Pro' Upgrade Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should effective throughput be calculated to determine which upgrade option satisfies the 20ms latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MoE model adds 50% more parameters, so its latency will increase by 50% to 27ms, making it unviable.", "Scaling the dense model to 6 G-MACs will double the latency to 36ms, but the MoE model will have a latency of ~19.8ms, making it the only viable option.", "Since the Snapdragon NPU has 45 TOPS, both models are easily within budget. We should choose the simple scaled model.", "Both approaches increase MACs, so both will fail the latency budget. It's impossible to improve accuracy without increasing latency."], "correct_index": 1}}, {"id": "mobile-0567", "title": "The 'Smart Reply' Battery Drain Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Smart Reply proposal should launch after comparing the one-year Total Cost of Ownership (TCO) and the federated option's battery drain impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose Federated Learning. Its 1-year TCO of $710k is higher, but the privacy guarantees are worth the user battery drain of ~1.5%.", "Choose Centralized. Its 1-year TCO is $210k lower, and the Federated Learning option's 2.22% daily battery drain exceeds the 2% user-acceptance threshold.", "Choose Federated Learning. Its server-side operational costs are 6x lower, making it cheaper in the long run, and the battery impact is 0.22%.", "Choose Centralized. Federated Learning 1-year TCO is $630k, which is more expensive than $500k."], "correct_index": 1}}, {"id": "mobile-0568", "title": "The Fleet-Wide TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which cabin pre-warming training approach is cheaper once engineering operations cost is included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated Learning is cheaper because the weekly model updates result in less data transfer cost than daily user logs.", "Centralized is cheaper because training one large model is more compute-efficient than aggregating millions of small ones.", "Centralized is ~7x cheaper; the primary driver is the steep engineering and operational overhead of the Federated Learning system.", "Federated Learning must be chosen because its privacy benefits are paramount and the company should bear any cost."], "correct_index": 2}}, {"id": "mobile-0569", "title": "The Federated Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Driver Alertness architecture has lower first-year TCO, centralized cloud inference or decentralized federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Centralized, because the $1.25M engineering cost for FL is too high to overcome.", "B) They are roughly equivalent in cost, so the decision should be based on model accuracy alone.", "C) Decentralized (FL), as its high initial engineering cost is less than the massive, recurring data transfer and compute costs.", "D) Centralized, because data ingress costs are negligible and modern GPUs are highly efficient."], "correct_index": 2}}, {"id": "mobile-0570", "title": "The Drowsiness Detection TCO Dilemma", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 1M smart-driving DAU, is cloud inference or on-device FL cheaper after FL infrastructure and battery churn?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The federated approach is too expensive; the revenue lost from 0.5% churn is over $250,000.", "The centralized approach is cheaper because the cloud inference cost is minimal, and data transfer costs are negligible for small clips.", "The centralized approach's annual cost is over $300k, dominated by data egress and inference, making the federated option's ~$106k Year 1 cost (with negligible churn) far cheaper.", "Both options are roughly equivalent in cost once you factor in the engineering salary required to maintain the Federated Learning infrastructure."], "correct_index": 2}}, {"id": "mobile-0571", "title": "The Route Prediction TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Smart Routes proposal is more cost-effective over time once cloud costs, FL infrastructure, and battery churn are included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Proposal A is cheaper. The battery drain cost for Proposal B is over $700k because the energy calculation was not converted from minutes to hours.", "Proposal B is significantly cheaper (by over $60k), as the only cost is the user battery impact; the server cost is negligible.", "Proposal B is cheaper in the long run (saving ~$19,100/year in recurring costs), despite higher Year 1 CapEx. The cloud training cost for A is significantly higher than the combined FL server and fleet-wide battery 'churn' cost for B.", "Proposal A is cheaper by ~$58,000. The Federated Learning 'churn cost' alone is higher than the entire cloud budget."], "correct_index": 2}}, {"id": "mobile-0572", "title": "The GPU Context Switch Overhead", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where are the missing 8ms going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0573", "title": "The On-Device LLM Feasibility Check", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Walk through the feasibility analysis — memory, compute, latency, and battery — to assess whether a 3B on-device chatbot can meet phone constraints.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0574", "title": "The NPU Delegation Failure Modes", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is 87% NPU delegation potentially worse for latency than 0% delegation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0575", "title": "The ANE vs GPU Power Efficiency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare NPU and GPU TOPS per watt, and when might the GPU still be preferable despite lower efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0576", "title": "The Metal Performance Shaders", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "When is rewriting a custom layer in Metal compute shaders faster than CoreML, and when does it backfire?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0577", "title": "The On-Device Speech Recognition", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the RTF for each, and explain why Whisper's architecture fundamentally conflicts with streaming ASR even when the RTF is acceptable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0578", "title": "The NNAPI Fragmentation Problem", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same TFLite INT8 model exhibit a massive 3x latency regression (22ms vs 6ms) on a specific flagship Exynos NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0579", "title": "The Transformer vs CNN on Mobile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is ViT-Small disproportionately slower than MobileNetV3 on Snapdragon 8 Gen 3 despite having only 21x more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0580", "title": "The ANE Delegation Regression", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Core ML detector regress from 3 ms on A17 Pro ANE to 30 ms CPU inference after an iOS update?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0581", "title": "The Pocket Oven LLM", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the thermal budget and design a system that prevents overheating?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0582", "title": "The Noisy Environment Speech Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you fix speech recognition noise robustness within a 50 MB mobile model budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0583", "title": "The Real-Time Video ML Frame Drop", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's consuming the other 21ms per frame and causing the drop to 15 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0585", "title": "The Data Starvation NPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 10 TOPS Android NPU deliver only 2 TOPS on the detector, and how would you confirm memory bandwidth starvation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0587", "title": "The 15 FPS Video ML Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can't three 30 FPS models run at 30 FPS together?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0588", "title": "The Snapdragon 8 Elite NPU Scheduling", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Snapdragon 8 Elite NPU latencies spike from 5ms to 18ms while the CPU decodes 4K H.265 video, considering the shared 6 MB system-level cache (SLC)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0589", "title": "The MediaTek Dimensity APU Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a QNN-optimized speech model run 13x slower after switching from Hexagon to MediaTek NeuroPilot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0590", "title": "The Samsung Exynos NPU Fragmentation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you ship one model that works across three Exynos generations with vastly different NPU architectures?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0591", "title": "The CoreML vs TFLite Performance Gap", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What explains Core ML's 1.8 ms MobileNetV3 latency on iPhone versus TFLite's 5.2 ms on Android?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0592", "title": "The ISP Format Conversion Bottleneck", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 5ms NPU model cap at 12 FPS when 4K frames are resized through OpenCV on the CPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0593", "title": "The Double-Precision Mobile Tax", "topic": "extreme-quantization", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does using `0.5` instead of `0.5f` slow ARM NEON bounding-box postprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0594", "title": "The WebView WebGL Throttle", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What OS-level security mechanism is strangling the WebGL performance within the mobile WebView?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0595", "title": "The CPU-GPU Asynchronous Desync", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline rule did you break?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 3}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0596", "title": "The Native Bridge Array Copy", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does passing an image over the React Native bridge add 50ms, and how can it be bypassed for video?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0597", "title": "The Fusion Illusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the interaction between the CoreML runtime, the A17 Pro's hardware, and the pruned model to explain this disappointing result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0598", "title": "The Pruning Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can structured attention-head pruning run faster than more aggressive unstructured pruning on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0599", "title": "The Delegate Dilemma", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do 20 NPU-CPU graph partitions cause a 5x slowdown for a single custom operator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0600", "title": "The Sparse Illusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the trade-offs and justify which approach is superior for a mobile deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0601", "title": "The Generative Keyboard's Hidden Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much latency can fusing LayerNorm, GeLU, and Add save for a generative keyboard?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0602", "title": "The Speculative Speedup", "topic": "speculative-decoding", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can speculative decoding speed up a mobile 7B LLM despite adding a second draft model?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0604", "title": "The Cloud-Native Fallacy on Mobile", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the mobile SoC shared-memory transfer of a 512MB latent tensor still a serious power and latency problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0605", "title": "The Accelerator Selection Conundrum", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Given the heterogeneous nature of modern mobile SoCs (CPU, GPU, NPU), which accelerator would you primarily target for each model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0607", "title": "The Budget Phone Mystery", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is there a 10x performance gap between a flagship and a budget phone when both advertise NPU acceleration?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0608", "title": "The Streaming ASR Trade-off", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might the smaller RNN-T model be the right choice for live mobile captions despite lower benchmark accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0609", "title": "The Shared GPU Contention", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do a 12ms ML GPU kernel and 4ms UI render still drop frames, and what scheduling strategy prevents jank?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 4}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0610", "title": "The NPU Utilization Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 3B INT4 LLM on A18 Pro reach 100% Neural Engine utilization but only 15 tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0611", "title": "The Inference Timing Jitter", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What would you identify as the sources of timing variance and design a system that guarantees sub-10ms P99?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 1}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0612", "title": "The Heterogeneous Pipeline Director", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you approach scheduling and resource allocation across these different compute units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0614", "title": "The CoreML ANE Fallback", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can using the dedicated AI accelerator make the model slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0615", "title": "The Hardware-Aware NAS for Mobile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should NAS target both sub-5ms Apple latency and sub-8ms Qualcomm latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0616", "title": "The 60 FPS Camera ML Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you deliver a smooth 60 FPS experience when the model takes longer than the frame time?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 4}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0617", "title": "The Depthwise Memory Bound", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why didn't the 8x reduction in math translate to an 8x reduction in time?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0618", "title": "The Google Tensor G4 TPU Trade-off", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the team choose Pixel 9 Pro Tensor G4 or Snapdragon 8 Gen 3 based only on 27 versus 45 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0620", "title": "The JNI Boundary Crossing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is JNI doing that consumes 13ms of overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0621", "title": "The CoreML Neural Engine Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did CoreML silently reject the Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0623", "title": "The Metal Shader Threadgroup Limit", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a Metal shader with 1024-thread groups run on newer hardware but crash on older devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0624", "title": "The Quantization Slowdown Paradox", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the potential causes of this system-level slowdown, despite the core operation getting faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0625", "title": "The Performance Cliff", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can adding unsupported Dynamic Kernel blocks collapse ANE performance despite only a 5% MAC increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0626", "title": "The Thermal Throttling Paradox", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does forcing maximum NPU performance result in worse sustained average latency than OS-managed clocks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0627", "title": "The Operator Fusion Fallacy", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is aggressive quantization from INT8 to INT4 likely to fail to improve latency here, and what is the true underlying system bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0628", "title": "The Speculative Decoding Memory Trap", "topic": "speculative-decoding", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does this time-saving algorithm lead to catastrophic memory failure on a mobile device, and what specific hardware constraint is being violated?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0629", "title": "The Pruning vs. Distillation Dilemma", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is 50% unstructured pruning a poor path to the 16ms mobile deadline, and what compression technique should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0630", "title": "The Speculative Pruning Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does combining unstructured pruning with speculative decoding make the mobile LLM worse than speculative decoding alone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0631", "title": "The Night-Blind Driver Monitor", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate this suggestion and predict the true, underlying reason for this catastrophic, light-dependent failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0632", "title": "The Multi-Node Latency Catastrophe", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did latency increase non-linearly instead of scaling gracefully, and what specific hardware interaction is the likely cause?", "chain_ids": ["mobile-chain-auto-secondary-017-08"], "chain_positions": {"mobile-chain-auto-secondary-017-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0633", "title": "The Voice Assistant That Froze The Speedometer", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Assess the situation: Why would an LLM scheduling optimization, designed to improve throughput, cause a catastrophic failure in the hard-real-time instrument cluster?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 3}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0634", "title": "The Real-Time Batching Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a throughput-oriented batching window violate the real-time translation latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0635", "title": "The Topology-Oblivious Optimization", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a topology-oblivious All-to-All optimization severely degrade intra-node MoE communication speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0636", "title": "The Heterogeneous Orchestrator", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you outline a system-level strategy for orchestrating these heterogeneous compute units, addressing potential bottlenecks and power concerns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0637", "title": "The Heterogeneous Execution Strategy", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Conv2D, custom attention, GELU, and dynamic control flow run across Hexagon NPU, Adreno GPU, and Kryo CPU to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0638", "title": "The Big.LITTLE Synchronization Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does limiting execution to 4 threads outperform 8 threads for inference on a big.LITTLE CPU architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0639", "title": "The Federated Learning Device Heterogeneity", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does device heterogeneity break federated learning, and how do you fix the round completion rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0640", "title": "Architecting a Multi-Model On-Device AI Assistant", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system orchestrate and schedule the wake word, ASR, 3B LLM, TTS, and vision models given the memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0641", "title": "Building a Hardware-Adaptive Inference Engine", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a cross-platform inference engine that automatically adapts to each SoC's strengths without maintaining 5 separate model variants?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0642", "title": "The On-Device LLM System Design", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect memory, inference, and UX to run a 3B parameter LLM at ≥20 tokens/sec on an 8GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0643", "title": "The Multi-Modal Sensor Fusion System", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect the fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0645", "title": "The On-Device LLM Keyboard Power Drain", "topic": "speculative-decoding", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should speculative decoding be used to meet the AI keyboard's latency and sub-1W power requirements?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 2}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0646", "title": "The 'Live Scribe' Concurrency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What scheduling architecture is needed for Live Scribe to manage NPU contention across streaming and interactive requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0647", "title": "The Automotive Assistant's Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture avoids priority inversion for an in-car assistant handling both urgent commands and long conversations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0648", "title": "The Sentient Dashboard: Compressing a Foundation Model for Real-Time Driver Monitoring", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What multi-stage compression strategy can productionize a 10B foundation model to meet strict real-time safety deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0649", "title": "The Phantom Reboot: Designing a Resilient Automotive ML Watchdog", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should the drowsiness detector recover from NaN freezes without triggering the vehicle control watchdog reboot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0650", "title": "The SolarSentry Dashcam Meltdown", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What dynamic architecture lets SolarSentry detect parked-car threats within thermal and battery constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0651", "title": "The Near-Miss Privacy Paradox", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid near-miss learning system minimizes cellular transfer cost while preserving privacy and safety validation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stream all sensor data to the cloud via cellular network 24/7 to guarantee the highest fidelity training dataset.", "Run a hybrid system: local models flag potential near-misses but only transmit anonymized, compressed metadata.", "Deploy an untested, unvalidated federated learning model directly to the control systems of 1 million cars.", "Store all near-miss data locally until the user connects their car to an active Wi-Fi connection."], "correct_index": 1}}, {"id": "mobile-0652", "title": "The Sun-Soaked Sentry Problem", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions to meet these conflicting thermal, power, and performance requirements on an automotive SoC?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0653", "title": "The On-Device 7B LLM Mandate", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What system design can fit and run a mandated 7B AI Concierge model within a 2GB on-device memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0654", "title": "The Automotive Co-Pilot Conundrum", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the three main pillars of your technical plan to bridge this gap, how do they interact, and what are the expected quantitative gains from each?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0655", "title": "The Guardian Copilot's Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What preemptible scheduling architecture prevents low-priority Guardian copilot work from blocking safety-critical tasks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0656", "title": "The Sentry Mode Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hierarchical sensing and duty-cycling architecture meets the 72-hour Sentry Mode power, thermal, and responsiveness constraints?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0657", "title": "The Autonomous Dashcam's Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is continuous powerful vision inference infeasible for the autonomous dashcam, and what duty-cycled architecture is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0658", "title": "The AutoScribe Jank Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should transcription and LLM summarization be scheduled to avoid UI freezes and meet real-time deadlines?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 4}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0659", "title": "The In-Car LLM Mandate", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What portfolio of model and systems optimizations makes the 7B in-car AI Co-Pilot feasible without cloud connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0660", "title": "The On-Device Copilot Power Budget", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What multi-model speculative
decoding system can meet the always-on phone assistant's latency and power targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0661", "title": "The Sentient Dashcam: Designing for Hostile Automotive Environments", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What adaptive control system should manage Sentry Mode power and thermals while meeting a 5-second detection deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0662", "title": "The AR Navigation Preemption Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you schedule a 400ms VLM inference without starving a 30 FPS navigation task on a non-preemptible NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0663", "title": "The App Sandbox Memory Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Your models only use 23 MB — how can that cause OOM on a 4 GB device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0664", "title": "The Mobile LLM KV-Cache Squeeze", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What grows until the OS jetsams a 3B INT4 chat app after 10 or more turns?", "chain_ids": ["mobile-chain-auto-014-20"], "chain_positions": {"mobile-chain-auto-014-20": 0}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0665", "title": "The Unified Memory Architecture Advantage", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural difference lets Apple's unified memory avoid copies that Qualcomm's shared DRAM still requires?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0666", "title": "The App Memory Pressure Levels", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 1.2GB CoreML model be killed under iOS memory pressure, and how should weights be loaded to avoid Jetsam?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 2}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-0668", "title": "The Budget Phone Crash", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 15 MB model crash a 4 GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0669", "title": "The \"Small Model, Big Latency\" Puzzle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is a common, often overlooked factor causing this high latency on the CPU, and how would you investigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0670", "title": "The Quantization Bandwidth Boon", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is a primary reason for this limited latency improvement on a memory-bound mobile NPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0671", "title": "The OOM Crash on Older iPhones", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 28 MB model cause out-of-memory crashes on 4 GB RAM devices, and how can it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0673", "title": "The Silent Eviction", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does returning to an iOS app with a 400 MB mmap CoreML model cause a 3-second freeze?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0674", "title": "The Memory Bandwidth Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is Model A consistently slower than Model B on a mobile NPU despite having the same total MAC operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0675", "title": "The Memory Bandwidth Throttling", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1080p segmentation model drop from 30 FPS to 12 FPS when screen recording starts on the same memory bus?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 3}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0676", "title": "The Memory Map (mmap) Page Fault Freeze", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mmap of a 150 MB model freeze the first inference for 1.2 seconds but not later passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0677", "title": "The LPDDR5X Bandwidth Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is the memory bandwidth sufficient for real-time token generation at 30+ tokens/second?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 1}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0678", "title": "The On-Device Training Storage Bloat", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where did the 4.1 GB of storage bloat come from during on-device fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0679", "title": "The On-Device Vector Search L2 Mismatch", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an HNSW search over unnormalized 384-dim note embeddings return tax documents for an Italy vacation query?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0680", "title": "The Invisible OOM Crash", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 100 MB Android ML model crash with OOM when there is 500 MB of free physical RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0681", "title": "The Phantom OOM Crash", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can loading a 500 MB model OOM intermittently even when the phone reports gigabytes of free RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0682", "title": "The Memory-Mapped Page Fault", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mmap 
of a 100 MB TFLite model freeze the UI for 800 ms on the first Android inference?", "chain_ids": ["mobile-chain-auto-014-17"], "chain_positions": {"mobile-chain-auto-014-17": 0}, "chain_tiers": {"mobile-chain-auto-014-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0683", "title": "The Camera Pipeline Memory Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does camera-preview segmentation rise from 8 ms to 14 ms when the ISP uses LPDDR5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0684", "title": "The Mobile Memory Controller Puzzle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the missing 44% of theoretical LPDDR5X memory bandwidth during generation?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 2}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0686", "title": "The Memory-Mapped Weight Strategy", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can mmap cut a 300 MB model startup from 3s to under 500ms without shrinking the model?", "chain_ids": ["mobile-chain-auto-014-17"], "chain_positions": {"mobile-chain-auto-014-17": 1}, "chain_tiers": {"mobile-chain-auto-014-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0687", "title": "The On-Device RAG Memory Budget", "topic": "compound-ai-systems", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does a Galaxy S24 Ultra on-device RAG stack fit with a 3B INT4 LLM and 500K 768-dim embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0688", "title": "The Multi-Model Memory Sharing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much memory does a shared-backbone architecture save on a 1.5 MB INT8 backbone, and what runtime tradeoff occurs on the Apple ANE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0689", "title": "The On-Device Fine-Tuning Corruption", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can you personalize a 
100M image classifier without dropping original accuracy from 85% to 12%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0690", "title": "The On-Device Image Generation Memory Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the app get jetsammed at denoising step 12 even though 1.84 GB of weights fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0691", "title": "Quantization Strategy for On-Device Updates", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should a mobile segmentation team choose QAT over DRQ, and when is DRQ better for OTA model iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0692", "title": "Unpredictable Latency Spikes", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and mitigate these latency spikes, focusing on memory management within the ML inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0693", "title": "The DRAM Bandwidth Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and mitigate memory bandwidth contention between concurrent ML models and UI rendering?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 2}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0694", "title": "The JNI Object Pinning Death", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does JNI GetByteArrayElements make Android UI drop to 2 FPS after pinning 3 MB camera frames at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0695", "title": "The SLC Cache Eviction", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does UI scrolling ruin the ML power budget of an always-on audio classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0696", "title": "The On-Device LLM Memory Architecture", "topic": "memory-hierarchy-design", "competency_area": 
"memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a 3B FP16 LLM run on a phone when weights need 6 GB and the OS leaves 5 GB free?", "chain_ids": ["mobile-chain-auto-014-20"], "chain_positions": {"mobile-chain-auto-014-20": 2}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0697", "title": "The Custom Allocator Architect", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a custom memory allocator optimized for the deterministic nature of ML inference workloads on mobile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0699", "title": "The CoreML Multi-Array Pre-allocation", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you eliminate per-frame MLMultiArray allocation spikes in a 60 FPS Core ML video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0700", "title": "The Quantization Quirk", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why would some operations run in FP32 on the CPU despite the model being quantized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0701", "title": "The Conversion Precision Loss", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which layer type is most likely the culprit, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0702", "title": "The Zero-Point Drift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a TFLite INT8 activation hit 0 on-phone after a PyTorch model tested at 95% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0703", "title": "The Cross-SoC Accuracy Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same INT8 face verification model have 0.1% FAR on Pixel 8 Pro but 1.2% on Galaxy S23?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 0}, "chain_tiers": {"mobile-chain-auto-017-05": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0704", "title": "The Cross-Platform Confidence Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where do CoreML and TFLite INT8 conversion differences make iOS show 87% confidence while Android shows 71%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0705", "title": "The Adaptive Precision Challenge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you adapt your quantization strategy to maintain accuracy while still leveraging INT8 performance on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0706", "title": "The Quantization Fragmentation Trap", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can the same INT8 model produce different results on different SoCs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0707", "title": "The Neural Engine Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization choice should be used on Apple ANE versus Qualcomm Hexagon, and where is each platform's cliff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0708", "title": "The Quantization Conundrum", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What architectural reasons within the NPU or its surrounding SoC might explain this counter-intuitive result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0709", "title": "The Int8 Quantization Activation Clipping", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What property of mobile activation functions causes Layer 5 activations to saturate at 127 after INT8 quantization on a mobile DSP?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 1}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0710", "title": "The Mixed-Precision Deployment 
Plan", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a mixed-precision strategy that gets Neural Engine speed with FP32 accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0711", "title": "The INT4 Weight-Only Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is W4A16 a better quantization scheme than W4A4 despite keeping activations at 16-bit precision?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 2}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0712", "title": "The Quantization Divergence Across SoCs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can the same INT4 Gemma-2B weights produce different text on A18 Pro, Snapdragon 8 Gen 3, and Tensor G4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0713", "title": "The Cross-SoC Quantization Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does INT8 precision drop by 6% on the Exynos 2200, and how do you fix it without maintaining three separate models?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 1}, "chain_tiers": {"mobile-chain-auto-017-05": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0714", "title": "The INT4 Accuracy Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the significant technical challenges you anticipate with INT4 quantization compared to INT8, and how would you mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0715", "title": "The Granular Precision Architect", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a hardware-aware mixed-precision quantization strategy for a mobile generative model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0716", "title": "The Cross-Platform Confidence Score Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": 
"mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why do the confidence scores diverge by 0.45, and how do you guarantee cross-platform consistency for safety-critical applications?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 2}, "chain_tiers": {"mobile-chain-auto-017-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0717", "title": "The Delegation Lottery", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does adding a single TFLite Flex post-processing op raise the inference latency from 4ms to 38ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0718", "title": "The Heterogeneous Scheduling Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where does the pipeline spend 28ms if the NPU work accounts for under 0.5% of peak performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0719", "title": "The Mobile GPU Misconception", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the GPU 3× slower than the NPU for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0720", "title": "The SoC Interconnect Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is eating the other 9ms in the pipelined execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0721", "title": "The Qualcomm QNN SDK Delegation", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should QNN delegate each model operation across NPU, GPU, and CPU to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0722", "title": "The ANE Delegation Disaster", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same Core ML segmentation model run in 6 ms on the A17 Pro but 69 ms on the older A15 Bionic?", "chain_ids": ["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0723", "title": "The Dilated Convolution Penalty", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the dedicated AI hardware 4x slower than the general-purpose CPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0724", "title": "The UI Contention Crisis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an iPhone 15 Pro 7B local LLM drop from 25 to 5 tokens/sec when users scroll a 3D interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0725", "title": "The NPU Compiler Black Box", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do three compilers targeting three NPUs produce different partitioning decisions from the same ONNX graph, and how do you debug this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0726", "title": "The Mobile AI Chip Roadmap Bet", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you build a model deployment pipeline that survives this hardware fragmentation without maintaining 5 separate codepaths?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0727", "title": "The Interconnect Choke Point", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What SoC-level component likely causes the discrepancy between sum of stage times and end-to-end latency, and how is its impact quantified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0728", "title": "The Asymmetric Multiprocessing (Big.LITTLE) Stutter", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architectural feature of mobile CPUs causes this massive latency variance, and how can you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0729", "title": "The \"Unaccelerated Custom Op\" Dilemma", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you accelerate a GNN custom aggregation op that causes a 20 ms CPU spike on a 
mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0730", "title": "The ISP/NPU Hardware Synchronization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hardware synchronization mechanism prevents an NPU from reading an ISP hardware buffer before DMA writes are visible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0736", "title": "Zero-Shot UI Action Grounding", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the on-device grounding architecture and the handoff protocol under these strict memory and latency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0737", "title": "Local-Local RAG & On-Device Telemetry", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect the local vector store, the CoreML embedding pipeline, and the LDP mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0738", "title": "The Semantic Router", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What router architecture meets <10 ms latency, 5-turn context, <50 MB RAM, and Neural Engine execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0739", "title": "CoreML vs PyTorch", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Can you deploy a raw `.pt` or `.pth` PyTorch model directly into a native iOS Swift application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, iOS natively executes raw PyTorch files (.pt) using the Swift Torch API.", "Yes, but it requires FP16 conversion via `torch.half()` before loading.", "No, the model must be converted to the CoreML format for hardware acceleration on iOS.", "No, iOS only natively supports TensorFlow Lite models (.tflite)."], "correct_index": 2}}, {"id": "mobile-0740", "title": "Background Inference Limits", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary risk of running heavy ML inference in the background on iOS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The model will automatically downgrade to 1-bit quantization.", "The OS watchdog will forcefully terminate the app process to save battery and RAM.", "The user's screen will freeze until the background task completes.", "The App Store will reject the app during the review process."], "correct_index": 1}}, {"id": "mobile-0741", "title": "The Launch Blocker", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on the launch timeline, why are you seeing a 95% abandonment rate?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0742", "title": "The Operator Gap", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the DMA flow, why do two unsupported ops cause a 50% latency penalty in a mobile NPU deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0743", "title": "The Jetsam Guillotine", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the RAM diagram, what is the 'invisible' resource consumer causing the OS to kill your process?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 3}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0744", "title": "The Burst Benchmarking Illusion", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the performance timeline, what physical protection mechanism is engaging inside the smartphone?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 3}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0745", "title": "The Backbone Bloat", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the VRAM diagram, what is the 'efficiency gap' in your model loading strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0746", "title": "The Frankenstein Model", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "After a reboot interrupts a background model update, why can the app launch but produce garbage outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0747", "title": "The CPU Wake-Lock Tax", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the power domain diagram, where is the energy going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0748", "title": "The Silicon Shared Oven", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the SoC thermal envelope, why is the 3D game affecting the NPU speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0749", "title": "The Radio Energy Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the power breakdown, where is your optimization effort being wasted?", "chain_ids": ["mobile-chain-bucket-powerbud-06"], "chain_positions": {"mobile-chain-bucket-powerbud-06": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0750", "title": "The UMA Bandwidth Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the Unified Memory Architecture, why does a faster 120Hz screen slow down your AI models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0751", "title": "The ANE Efficiency Advantage", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the compute efficiency in TOPS per watt, and why does this metric matter more than raw TOPS for a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.5 TOPS/W. Mobile chips sacrifice efficiency for portability.", "7.0 TOPS/W. The battery is the binding constraint, so efficiency determines the thermal limit.", "35.0 TOPS/W. The 5W figure represents idle power.", "175.0 TOPS/W. 
NPUs achieve superlinear scaling at INT8."], "correct_index": 1}}, {"id": "mobile-0752", "title": "The 48MP Camera Firehose", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before any ISP processing or compression, what is the raw data bandwidth flowing from the sensor to the SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~30 MB/s. Each 48MP frame is about 1MB compressed.", "B) ~288 MB/s. 48MP at 1 byte per pixel at 6 fps.", "C) ~2.9 GB/s. 48MP at 2 bytes per pixel at 30 fps, uncompressed.", "D) ~29 GB/s. 48MP at 20 bytes per pixel with full color depth."], "correct_index": 2}}, {"id": "mobile-0753", "title": "The On-Device Fine-Tuning Data Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much memory does this training dataset occupy, and does it fit comfortably in the unified memory of a mobile device alongside the model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~6 MB. JPEG images are small and efficient.", "B) ~60 MB. Each tensor is about 60 KB.", "C) ~600 MB. Each 224x224x3 FP32 tensor is ~600 KB, times 1,000 images.", "D) ~6 GB. Deep learning datasets always require gigabytes."], "correct_index": 2}}, {"id": "mobile-0754", "title": "The INT4 Quantization Payoff", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory does the INT4 model save, and what is the primary risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 0.75 GB savings. INT4 is only 25% smaller than INT8.", "B) 1.5 GB savings. INT4 uses half the bytes of INT8, but risks accuracy loss in sensitive layers.", "C) 3.0 GB savings. INT4 eliminates half the parameters entirely.", "D) No savings. INT4 still requires INT8 storage with a lookup table."], "correct_index": 1}}, {"id": "mobile-0755", "title": "The Battery Inference Budget", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this feasible with inference alone, ignoring all other phone functions?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 0}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes. 18.5 Wh / 5W = 3.7h of inference, plus the OS is free.", "No. Inference alone drains the battery in 3.7h; with system overhead it is ~2.3h. Must reduce frame rate.", "Yes. 
The ANE uses only milliwatts for neural inference.", "No, but only because the phone will thermal-throttle first."], "correct_index": 1}}, {"id": "mobile-0756", "title": "The Depthwise Separable Convolution Dividend", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "By what factor does this reduce the multiply-accumulate operations per spatial position?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~2.0x (Trap: assuming it just splits one convolution into two equal halves).", "B) ~4.5x (Trap: omitting the pointwise projection cost).", "C) ~8.7x (Correct).", "D) ~256x (Trap: assuming it scales perfectly with channel depth alone)."], "correct_index": 2}}, {"id": "mobile-0759", "title": "The Vocabulary Embedding Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What fraction of a 3.5 GB INT4 model does the embedding table represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~26 MB", "~262 MB", "~1 GB", "~2.6 MB"], "correct_index": 1}}, {"id": "mobile-0760", "title": "The Speculative Decoding Gambit", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the effective token throughput with speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 25 tok/s.", "B) 10 tok/s.", "C) 5 tok/s.", "D) 20 tok/s."], "correct_index": 1}}, {"id": "mobile-0761", "title": "The Cellular Model Delivery Problem", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the download time and propose a deployment strategy that respects mobile network constraints?", "chain_ids": ["mobile-chain-auto-secondary-005-14"], "chain_positions": {"mobile-chain-auto-secondary-005-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-005-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0762", "title": "The Federated Learning Upload Bill", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the total daily upload bandwidth and explain why naive federated learning is infeasible at scale without compression?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0764", "title": "The Privacy Budget Drain", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does advanced 
composition improve this, and what is the practical implication for query budgeting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0765", "title": "The On-Device Training Memory Crisis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much memory does gradient checkpointing save for 32-layer FP16 activations, and what compute cost does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0767", "title": "The Mobile MoE Memory Illusion", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why must an 8-expert INT4 MoE keep all 2GB of experts resident on mobile despite top-2 routing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0768", "title": "The On-Device RLHF Memory Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the specific memory stacks involved, and is on-device RLHF feasible within 8 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0769", "title": "The Replay Buffer Memory Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you calculate the buffer size and design a memory-aware eviction policy that prevents unbounded growth on a memory-constrained device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0770", "title": "The Million-Device Adapter Sync Storm", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the server-side egress bandwidth required and cost for syncing 10MB adapters every 5 minutes to 1 million users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0771", "title": "The Federated Gradient Compression Trade-Off", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the per-round upload volume and analyze the convergence trade-off of aggressive compression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0772", "title": "The Model Distillation Sync Budget", "topic": "collective-communication", "competency_area": 
"networking", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a sync protocol that keeps the student model fresh without exceeding the 100 MB daily cellular data budget?", "chain_ids": ["mobile-chain-auto-secondary-005-14"], "chain_positions": {"mobile-chain-auto-secondary-005-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-005-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0773", "title": "The Silent Model Corruption Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a verification system that detects such corruption before inference begins?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0774", "title": "The Thermal Throttling Adaptation Loop", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an adaptive inference strategy that maintains acceptable user experience under thermal constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0775", "title": "Diagnosing Core ML ANE Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the architectural root cause of this execution pattern, and how can it be analyzed and resolved to achieve target performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0776", "title": "CoreML Execution Fallback Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total CoreML frame latency and sustainable FPS after adding ANE compute, CPU fallback, and two copy penalties?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0777", "title": "CoreML ViT ANE Fallback Evaluation", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimization path best fixes ViT CoreML ANE-GPU fallback and meets the 60 FPS frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0778", "title": "TFLite Delegate Graph Partitioning Trade-offs", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", 
"question": "How would you evaluate the trade-offs and justify a design decision to meet both the 33ms latency and 1.5W power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0779", "title": "Diagnosing TFLite NNAPI Delegate Subgraph Fallback", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can enabling the TFLite NNAPI delegate make a MobileNetV3 model slower than CPU inference?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 1}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0780", "title": "TFLite Delegate Subgraph Partitioning Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If each CPU-GPU context switch (tensor copy and synchronization) takes 2ms, what is the expected new end-to-end latency?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 0}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0781", "title": "Edge Vision Model: CNN vs ViT Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture is better suited for an 8 TOPS INT8 NPU with 2 MB SRAM: MobileNetV2 or a Mobile ViT of equivalent accuracy, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0782", "title": "The NAS Latency Predictor Blind Spot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did an NPU latency LUT predict 8ms but produce 22ms inside a full camera pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0783", "title": "Hedged Edge-Cloud Model Routing", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "When should the voice translation app launch a local hedged fallback to guarantee a 400ms P99 latency bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0784", "title": "Thermal-Aware On-Device Pipelining", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can asynchronous pipelining 
preserve 30 FPS after GPU thermal throttling increases one stage to 25ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0785", "title": "Battery-Aware Sensor Batching", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What sensor batching window minimizes smartwatch CPU duty cycle while preserving fall detection within a 1000ms SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0787", "title": "Speculative Decoding Performance Regression", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding reduce mobile throughput despite a 65% draft acceptance rate?", "chain_ids": ["mobile-chain-auto-secondary-017-33"], "chain_positions": {"mobile-chain-auto-secondary-017-33": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0788", "title": "The Context Length Latency Spike", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mobile chat decode latency jump exactly at a 1024-token context length?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0789", "title": "Multi-Turn Chat Degradation", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does multi-turn mobile chat TTFT degrade as history grows while decode speed stays constant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0790", "title": "The Code Generation Stall", "topic": "speculative-decoding", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding become slower than target-only decoding for Python code generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0791", "title": "The Background Batching Bottleneck", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does adding a background summarizer reduce total LLM throughput instead of improving batching efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0792", "title": "The INT8 KV Cache Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does quantizing the KV cache to INT8 save memory but increase decode latency and lower hardware utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0793", "title": "The PagedAttention System Call Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a PagedAttention port with 16-token blocks cause CPU overhead and decode jitter on Android?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0794", "title": "AR Glasses Thermal Budgeting", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does the AR glasses ML workload fit the 1.5W thermal envelope, and how should average power be calculated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0795", "title": "Federated Learning Carbon Trade-offs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the daily carbon footprint of the mobile federated learning approach including compute, 5G transfer, and cloud aggregation?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 3}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0796", "title": "Always-On Wake Word Battery Drain", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the percentage of battery drained over 24 hours for the DSP approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0797", "title": "LLM DVFS and Static Power Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the energy per token for both the max frequency and 50% frequency states to determine if downclocking actually saves energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0798", "title": "Super-Res NPU vs 5G Streaming Energy", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the total energy saved (in Joules) by using the ML super-resolution approach over 
the 2-hour movie?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0799", "title": "Thermal Throttling in Background Indexing", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of images the device can index per minute without violating the thermal limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0800", "title": "On-Device vs Cloud LLM Energy Cost", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much more total system energy (in Joules) does the on-device approach use compared to the cloud approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0802", "title": "Memory-Bound NPU Execution Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the layer take 0.5ms instead of the compute-bound 0.036ms estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0803", "title": "Unaligned Channel Compiler Fallback", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 13-channel Cortex-M4 convolution be 3x slower than a 16-channel convolution with more MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0804", "title": "Activation Function CPU Fallback", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did replacing ReLU with SiLU make the mobile NPU segmentation model jump from 20ms to 850ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0805", "title": "Dispatch Overhead in Tiny Models", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 50KB dense model run faster on a Cortex-A CPU than on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0806", "title": "Data Reuse in Depthwise Convolutions", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a depthwise convolution with 10x fewer MACs 
be slower than a pointwise convolution on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0807", "title": "Activation Spilling and SRAM Overflow", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a high-resolution input make the Cortex-M4 model spike from 50ms to 500ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0808", "title": "SRAM Weight Pinning and Power Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 10MB mobile NPU model draw unexpected power by repeatedly reading weights from DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0809", "title": "DMA Double-Buffering Overhead", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does DMA fail to hide memory latency when the Cortex-M4 waits for each tile before computing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0810", "title": "KV Cache Tiling and SRAM Thrashing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does KV cache tiling latency grow quadratically as sequence length increases from 100 to 1000?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0811", "title": "Fake Quantization CPU Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did INT8 quantization reduce MobileNet size but not CPU inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0812", "title": "INT8 Accumulator Overflow", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do all INT8 DSP intermediate tensors saturate to 127 and destroy accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0813", "title": "Asymmetric Quantization Runtime Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does asymmetric INT8 quantization increase mobile CPU latency compared with 
symmetric quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0814", "title": "Mobile New 0014", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a mixed INT8/INT16 mobile network run slower than an all-INT16 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0815", "title": "Per-Channel Quantization Memory Stalls on DSP", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does per-channel INT8 quantization stall DSP vector ALUs compared to per-tensor quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0816", "title": "APK Compression Breaking Zero-Copy Mmap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a compressed 20MB TFLite model freeze the Android UI for 4 seconds during first initialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0817", "title": "Selective Building for Edge ML Binaries", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 2MB keyword spotting model produce a 115MB IoT binary, and how should the runtime be built instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0818", "title": "Memory Fragmentation Causing NPU OOM Errors", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does loading a 50MB model fail with an OOM error when 120MB of free RAM is available?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0819", "title": "Flatbuffer Alignment Shifts in OTA Updates", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can changing one bias value make an OTA delta for a 100MB model nearly as large as the full model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0820", "title": "DRAM Power Bottleneck in High-FPS Edge Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the AR 
headset drain battery despite only 20% NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0821", "title": "Polling vs Event-Driven Execution in Always-On MCU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the always-on Cortex-M4 wake-word detector die in days instead of lasting 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0822", "title": "Thermal Leakage Power Runaway in Mobile SoCs", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does SoC power rise from 3W to 4.5W over time despite constant MACs per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0823", "title": "CPU Preprocessing Overhead in Edge Vision Pipelines", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone vision pipeline consume high baseline power before NPU inference starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0824", "title": "Semantic Gating for Always-On Object Detection", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the empty-hallway security camera drain battery as fast as a busy scene?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0825", "title": "DVFS Thermal Throttling in Continuous Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Orin tracking latency abruptly double after four minutes despite normal ambient temperature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0826", "title": "Ergonomic Skin Temperature Limits on Mobile NPUs", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the smartphone OS throttle the NPU even though die temperature is only 50C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0827", "title": "Thermal Saturation in Passively Cooled AR Headsets", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "Why does a passively cooled AR headset gradually fall from 60 FPS to 15 FPS in a one-hour test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0828", "title": "Ambient Temperature Impact on Passively Cooled Edge Servers", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an edge server in a closed metal box reboot during the day but work at night?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0829", "title": "Memory Bus Contention Between Concurrent Edge Models", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the audio wake-word model miss deadlines only when video upscaling runs concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0830", "title": "UI Thread Preemption of Mobile ML Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does on-device translation token latency spike when the user scrolls quickly through the UI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0831", "title": "Kernel Fusion Misses and Memory I/O Latency", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an unfused Conv-BatchNorm-ReLU block take 1.8ms instead of one 0.5ms GPU kernel?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0832", "title": "Layout Mismatch Transpose Overheads in TFLite", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does TFLite insert expensive Transpose operations after converting a PyTorch model for a mobile DSP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0833", "title": "NPU Memory Tiering Specification", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a layer execution and weight-tiering specification to maximize the auto-regressive token generation rate, detailing how to utilize the fast SRAM.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0836", "title": "Mixed-Precision Mobile Super-Resolution", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the mechanism of the accuracy drop, and what mixed INT8/FP16 execution graph specification resolves it while preserving throughput?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 3}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0840", "title": "Mitigating Mobile DRAM Thermal Throttling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate an architectural change to the model's execution strategy to eliminate off-chip intermediate activation memory traffic.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0850", "title": "Analyze MobileNetV3 latency on A17 Pro ANE vs GPU execution path", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the ANE 2.3x faster than the GPU for MobileNetV3, and which specific operations benefit most?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0851", "title": "Analyze EfficientNet-Lite0 vs MobileNetV3 accuracy-energy tradeoff on Snapdragon 8 Gen 3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does EfficientNet-Lite0 compare to MobileNetV3-Large in energy consumption per inference on the Snapdragon 8 Gen 3 Hexagon DSP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0852", "title": "Diagnose accuracy regression after Core ML conversion of EfficientNet-B0 for A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the sources of the 2.3pp accuracy regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0853", "title": "Evaluate depthwise separable vs standard conv for on-device training on Snapdragon 8 Gen 3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate whether depthwise separable convolutions or standard convolutions in these final blocks better balance training speed and accuracy for on-device learning?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0854", "title": "Evaluate EfficientNet-Lite vs MobileNetV3 for CoreML ANE deployment accuracy-latency", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which CNN backbone should be selected for an accuracy-critical iOS app running on the ANE, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0855", "title": "Fluency: explain MobileNet-family tradeoffs to an iOS product manager", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the MobileNet vs ResNet-50 accuracy, speed, and battery tradeoffs to a product manager?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0856", "title": "Fluency: describe inverted residual block execution on Qualcomm Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV2 inverted residual block map to Hexagon HTP, and when is expansion ratio 6 too large?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0857", "title": "Implement MobileNetV3 fine-tuning pipeline for Core ML deployment on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you fine-tune and convert MobileNetV3-Large so it runs on the Neural Engine under 3ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0858", "title": "Implement EfficientNet-B0 for Snapdragon 8 Gen 3 Hexagon NPU via QNN SDK", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should EfficientNet-B0 be converted and optimized for Snapdragon 8 Gen 3 Hexagon deployment with the QNN SDK?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0859", "title": "Mastery: explain compound scaling failure modes for mobile deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Does EfficientNet-B3 fit the unchanged 5ms mobile NPU SLA, and where does compound scaling break down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-0860", "title": "Mastery: MobileNetV3 SE block impact on A17 Pro ANE vs Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should MobileNetV3 SE blocks be handled on A17 Pro ANE versus Snapdragon Hexagon, and what latency overheads matter?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0861", "title": "Optimize MobileNetV2 expansion ratio for Snapdragon 8 Gen 3 Hexagon HTP memory", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 expansion ratios be adjusted to reduce early activation memory pressure on Hexagon HTP?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0862", "title": "Optimize EfficientNet-Lite0 for continuous inference battery life on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can EfficientNet-Lite0 continuous AR inference cut battery drain below 4% per hour without changing architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0863", "title": "Realization: ship MobileNetV3 model update OTA with Core ML compilation on device", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV3 Core ML OTA update be delivered, compiled, validated, and rolled back within the setup target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0864", "title": "Realization: build multi-model CNN ensemble on Snapdragon 8 Gen 3 with CPU+NPU split", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV3-Small and EfficientNet-Lite0 ensemble be split across Snapdragon compute units under 8ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0865", "title": "Recall depthwise separable convolution parameter count for MobileNetV2 first block", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": 
"inference", "question": "How would you calculate the exact parameter count and FLOPs for this block and explain why t=1 is used only in the first block?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0866", "title": "Specification: define mobile CNN requirements for real-time AR on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What technical specification should define this real-time AR mobile CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0867", "title": "Specification: model format and versioning requirements for cross-platform mobile CNN deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What packaging, versioning, and validation spec should govern cross-platform MobileNetV3 deployment to Core ML and QNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0868", "title": "Analyzing Multi-Model Pipeline Memory Pressure on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory pressure and identify which models can be hot-swapped vs kept resident to stay within the 4GB app memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0869", "title": "Analyzing Compound AI Pipeline Thermal Throttling on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the thermal contribution of each pipeline stage and determine which stage to optimize first to recover performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0870", "title": "Analyzing On-Device vs Cloud Offload Decision for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze under which conditions on-device vs cloud is preferred, considering battery, latency, and privacy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0871", "title": "Designing an On-Device RAG Architecture for A17 Pro with 8GB Memory", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an on-device RAG system fit document search and LLM generation within memory and 
response-time constraints?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0872", "title": "Designing a Resilient Agent Fallback Architecture for Intermittent Connectivity on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a mobile compound AI agent switch gracefully between cloud and on-device modes during intermittent connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0873", "title": "Diagnosing Slow First-Token Latency in On-Device RAG on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Tensor G3 on-device RAG have 4.2 s TTFT with a loaded 1B INT8 LLM and three 400-token documents?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0874", "title": "Diagnosing Context Overflow Failures in Multi-Turn Mobile Agents on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose when context overflow occurs and design a prevention strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0875", "title": "Diagnosing Retrieval Quality Degradation After App Update on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why retrieval quality degraded and quantify the impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0876", "title": "Evaluating Embedding Model Quality vs Size Tradeoff on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which model maximizes quality within a 200ms retrieval budget per session and a 100MB memory limit for the embedding model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0877", "title": "Evaluating Speculative Decoding Viability for On-Device LLM on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate whether speculative decoding 
provides a net speedup given the verification overhead on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0878", "title": "Evaluating End-to-End Compound Pipeline Quality on Tensor G3 with a RAG Evals Suite", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the gap between individual stage metrics and the 0.61 end-to-end satisfaction score?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0879", "title": "Explaining Compound AI Pipeline Stages to a Mobile App Developer on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does a compound AI pipeline need retrieval instead of feeding Core ML image features directly to a 1B LLM?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0880", "title": "Explaining Model Routing Tradeoffs in Compound AI to a Product Manager", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can't we just always use the best, most powerful model for every request on a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0881", "title": "Explaining Token Budget Constraints for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is the context window so much smaller on mobile, and why does it matter for compound AI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0882", "title": "Implementing a Streaming Response Pipeline for Compound AI on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the compound AI pipeline stream tokens so the first response appears within 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0883", "title": "Implementing Background Index Sync for On-Device RAG on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the background sync architecture with power and latency constraints to prevent battery drain?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0884", "title": "Mastering Compound AI System Reliability Engineering on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a reliability architecture that achieves a 99.5% successful response rate for this compound pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0885", "title": "Mastering End-to-End Optimization of a 4-Stage Compound Pipeline on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a comprehensive optimization strategy that achieves this target without changing models or hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0886", "title": "Mastering On-Device Knowledge Base Update Strategy for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a knowledge base update strategy that maintains retrieval quality, minimizes battery impact, and handles the case where updates arrive while the user is actively using the app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0887", "title": "Optimizing Retrieval Chunk Size for On-Device RAG on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the memory and latency impact and determine whether the quality improvement justifies the cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0888", "title": "Optimizing Prefill Latency via Prompt Compression on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the net latency improvement and decide whether prompt compression is worthwhile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0889", "title": "Optimizing Agent Tool-Call Frequency to Reduce Battery Drain on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can the Tensor G3 agent reduce tool-call energy by at least 40% without reducing task quality?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0890", "title": "Realizing a Compound AI App with CoreML Multi-Model Orchestration on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should CoreML load and schedule a three-model compound AI pipeline within a 600MB A17 Pro process budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0891", "title": "Realizing On-Device Agent State Persistence Across App Backgrounding on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an Android agent persist lightweight state so it can resume within 500ms after backgrounding without saving the entire KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0892", "title": "Realizing a Hybrid On-Device and Cloud Compound AI Pipeline on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a hybrid compound AI pipeline decide between cloud and on-device generation while preserving context privacy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0893", "title": "Recalling Compound AI Pipeline Components and Their Roles on Mobile", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the standard components in an on-device RAG pipeline, and what does each one do?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0894", "title": "Specifying Latency SLO Contracts for Each Stage of a Mobile Compound Pipeline on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What per-stage latency SLOs and error budgets should a five-stage pipeline use for a 1.5s TTFT SLA?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0895", "title": "Specifying Model Update and Rollback Contract for Compound AI on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What OTA model update and rollback protocol can detect a bad compound AI model and revert within one hour?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0896", "title": "Dataset Curation: Design On-Device Data Collection for Mobile Model Training", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the data collection, quality filtering, and privacy-preserving local storage pipeline?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 1}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0897", "title": "Dataset Curation: Evaluate On-Device vs Cloud Data Curation for Mobile ML", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should photo-enhancement data curation run on-device or in the cloud for privacy, latency, label quality, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0898", "title": "Dataset Curation: Evaluate Federated Data Heterogeneity Impact on Mobile Model Quality", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the impact of data heterogeneity on global model quality and propose a mitigation strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0899", "title": "Dataset Curation: Fluency — Token Budget for On-Device Fine-Tuning on A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you estimate the total fine-tuning compute, memory required, and time to complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0900", "title": "Dataset Curation: Implement Privacy-Preserving Data Collection on Snapdragon", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should DP-SGD be implemented for federated gradients on mobile devices to achieve epsilon 1.0, and what are the required parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0901", "title": "Dataset Curation: Implement Data Versioning for Mobile Federated Learning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should a federated learning system version-gate devices and data when model versions differ across the fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-0902", "title": "Dataset Curation: Mastery — On-Device Personalization Data Strategy for 1B Mobile Users", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete on-device personalization data strategy (collection, privacy, local curation, federated aggregation, and global model updates)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0903", "title": "Dataset Curation: Mastery — Sensor Fusion Dataset Design for On-Device Health Model", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the multi-modal data curation pipeline, synchronization strategy, and privacy-preserving collection protocol to fit the storage, sync, compute, and privacy plan for the target device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0904", "title": "Dataset Curation: Optimize Data Pipeline for Federated Learning on Mobile", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How can the Tensor G3 federated learning preprocessing pipeline be optimized when preprocessing dominates local training time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0905", "title": "Dataset Curation: Optimize Label Quality vs Cost Tradeoff for Mobile App Training", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What tiered labeling strategy can meet a $10,000 budget and near-90% label accuracy for 500K mobile photos?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0906", "title": "Dataset Curation: Realize Training Data Requirements for On-Device LLM Fine-Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What data and LoRA fine-tuning plan lets a 1B email assistant fit and train on mobile constraints within five minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0907", "title": "Dataset Curation: Realize Cross-Device Dataset Consistency for Mobile Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should cross-device federated learning enforce consistent tokenizers, data formats, and label schemas?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-0908", "title": "Dataset Curation: Recall — What is Federated Learning Data Privacy?", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does federated learning mean for mobile data privacy, and how does it differ from centralized data collection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0909", "title": "Dataset Curation: Specification — Mobile Data Quality SLA for RLHF Personalization", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What data quality SLAs should gate on-device RLHF personalization updates on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0910", "title": "Fault Tolerance: Analyze Checkpoint Overhead for On-Device Fine-Tuning on A17 Pro", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the expected daily failure rate during a 10-minute training window, and how does it impact checkpoint frequency?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 1}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0911", "title": "Fault Tolerance: Design Model Update Safety Protocol for Mobile LLM", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a fault-tolerant update protocol that limits the blast radius of a bad update to < 0.01% of total users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0912", "title": "Fault Tolerance: Evaluate On-Device vs Cloud Recovery for Mobile ML State", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the recovery time, user experience, and privacy implications of cloud recovery vs local rebuilding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0913", "title": "Fault Tolerance: Evaluate App Crash vs Model Corruption Recovery Paths", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare the recovery protocol, recovery time, and user impact for each failure mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0914", "title": "Fault Tolerance: Fluency — Mobile Checkpoint Write Speed and Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the checkpoint write time and energy overhead if saving every 60 seconds during a 10-minute background training session?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 0}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0915", "title": "Fault Tolerance: Implement Atomic Model Swap for Zero-Downtime Mobile Update", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should an atomic mobile model swap update a 500MB LLM without interrupting in-flight inference and still support rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0916", "title": "Fault Tolerance: Implement Differential Checkpoint for Mobile LoRA Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should differential checkpointing reduce mobile LoRA checkpoint size while preserving recovery correctness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0917", "title": "Fault Tolerance: Mastery — Mobile Personalization Reliability Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Design a complete fault tolerance architecture that preserves personalization state across all these events.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0918", "title": "Fault Tolerance: Mastery — Systematic Failure Mode Analysis for Mobile ML", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the top 5 failure modes, their probability, impact, and mitigations with quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0919", "title": "Fault Tolerance: Optimize Recovery Speed for Frequently Crashing Mobile Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should the job recover from thermal crashes in under 30 seconds and reduce future crashes?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0920", "title": "Fault Tolerance: Optimize Checkpoint Storage Budget for Low-Storage Mobile Devices", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you reduce checkpoint storage below 50 MB while preserving fault tolerance on low-storage A17 Pro devices?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 2}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0921", "title": "Fault Tolerance: Realize Checkpoint Architecture for Continuous Mobile Personalization", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify storage layout, checkpoint schedule, retention policy, and recovery paths for an on-device training system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0922", "title": "Fault Tolerance: Realize Mobile Fleet Update Rollout Timeline", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify staging percentages, validation gates, CDN requirements, and rollback procedures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0923", "title": "Fault Tolerance: Recall — What is an A/B Model Partition?", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is an A/B model partition strategy, and why does it improve fault tolerance for on-device model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0924", "title": "Fault Tolerance: Specification — Reliability SLA for Mobile Payment ML Model", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What reliability SLA should govern an on-device payment fraud model, including RTO, RPO, fail-safe behavior, and checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0925", "title": "Kernel Fusion: Recall — What is Operator Fusion on Neural Engines?", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is operator fusion on the A17 Pro Neural Engine, and which mobile transformer operations are commonly fused?", "chain_ids": 
["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0926", "title": "Kernel Fusion: Design Fusion Strategy for On-Device LLM on Snapdragon 8 Gen 3", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What fusion graph helps a 1B 4-bit GQA LLM on Snapdragon 8 Gen 3 reach 20 tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0927", "title": "Kernel Fusion: Evaluate Core ML vs ONNX Runtime Fusion for iPhone LLM", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate kernel fusion depth, tokenization throughput, first-token latency, and subsequent-token latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0928", "title": "Kernel Fusion: Evaluate Quantized Attention Fusion on A17 Pro", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16 unfused attention and INT8 fused attention compare for memory, latency, and accuracy?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0929", "title": "Kernel Fusion: Fluency — Estimate Neural Engine Throughput for Fused MLP", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the estimated Neural Engine throughput for the fused MLP block, and which phase is the bottleneck?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0930", "title": "Kernel Fusion: Implement Fused Conv+BN+ReLU for Mobile CNN on Snapdragon", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify the fusion implementation, verify correctness, and quantify memory bandwidth savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0931", "title": "Kernel Fusion: Implement Fused Attention for On-Device LLM", "topic": "kernel-fusion", 
"competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify tile sizes for flash attention given available SRAM, and compute memory savings vs standard attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0932", "title": "Kernel Fusion: Mastery — Optimize 3B LLM Decode Throughput on A17 Pro", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which optimizations move a 3B INT4 LLM decode on A17 Pro from 18 to 30 tokens/sec, and by how much?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0933", "title": "Kernel Fusion: Mastery — Fusion Strategy Across Heterogeneous Mobile Hardware", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a single unified model with hardware-adaptive fusion strategy that achieves 20 tokens/sec on all three platforms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0934", "title": "Kernel Fusion: Optimize SwiGLU Fusion for Mobile LLM on Tensor G3", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is an 8ms SwiGLU activation unrealistic, and what fusion fix should remove the overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0935", "title": "Kernel Fusion: Optimize Attention Fusion for Long-Context Mobile LLM", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you analyze feasibility and design an optimized attention fusion strategy for a 4096-token context on an 8GB device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0936", "title": "Kernel Fusion: Realize Fusion Plan for Production Mobile LLM Deployment", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify all fused op groups, their implementations, testing requirements, and expected latency for each platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0937", "title": "Kernel Fusion: Realize Bandwidth Savings from Fusing Transformer Block on 
Snapdragon", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory bandwidth does fusing post-attention operations save for a 1B decode block on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0938", "title": "Kernel Fusion: Recall — Arithmetic Intensity and Mobile Fusion Impact", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of a ReLU on a 4096-element FP16 tensor, and how much latency does kernel fusion save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0939", "title": "Kernel Fusion: Define Fusion Requirements for Cross-Platform SDK", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What fusion requirements and platform adaptation rules should a cross-platform mobile LLM SDK specify to reach 20 tokens per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0940", "title": "KV-Cache: Analyze KV Cache Memory Pressure on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why KV cache growth causes this degradation and compute the expected latency scaling?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0941", "title": "KV-Cache: Design KV Cache Management for On-Device LLM Chat App", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the KV cache management system for this application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0942", "title": "KV-Cache: Design Multi-Turn KV Cache Persistence for Mobile App Switching", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a KV cache persistence strategy that allows resuming conversation within 3 seconds of app foreground?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0943", "title": "KV-Cache: Diagnose KV Cache Overflow Causing Mobile App Crashes", "topic": "kv-cache-management", "competency_area": "memory", "track": 
"mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 3B LLM app crash near 3000 tokens, and how should the KV cache be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0944", "title": "KV-Cache: Diagnose KV Cache Thrashing in Concurrent Mobile LLM Requests", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does A17 Pro foreground chat slow 3x when a 1B background summarizer holds a 2048-token KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0945", "title": "KV-Cache: Diagnose Attention Head Selection Causing Memory Inefficiency", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the current memory inefficiency and quantify the GQA improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0946", "title": "KV-Cache: Evaluate KV Cache Quantization Strategies on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16, INT8, and INT4 KV caches trade memory, bandwidth, quality, and complexity for a 1B LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0947", "title": "KV-Cache: Evaluate Sliding Window vs Full KV Cache for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate memory, quality, and latency over a 50-turn, 3000-token conversation for these two KV cache strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0948", "title": "KV-Cache: Evaluate GQA vs MHA for Memory-Constrained Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the memory, bandwidth, quality, and architecture impact of each choice?", "chain_ids": ["mobile-chain-bucket-kvcachem-06"], "chain_positions": {"mobile-chain-bucket-kvcachem-06": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0949", "title": "KV-Cache: Fluency — KV Cache Size Estimation for 1B Mobile LLM", "topic": 
"kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a 4096-token conversation feasible in the available memory on this device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0950", "title": "KV-Cache: Fluency — Decode Throughput vs Context Length Trade-off on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does KV cache bandwidth degradation affect decode throughput across 256, 1024, and 4096 context tokens?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0951", "title": "KV-Cache: Fluency — GQA Memory Savings Calculation for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do MHA, GQA, and MQA KV-cache sizes affect maximum context for a 3B INT4 LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-06"], "chain_positions": {"mobile-chain-bucket-kvcachem-06": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0952", "title": "KV-Cache: Implement KV Cache Size Limiter for Mobile LLM Memory Safety", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a 1B LLM cap KV cache at an 80% memory threshold while preserving key tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0953", "title": "KV-Cache: Mastery — KV Cache Co-Design for Mobile LLM Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you co-design the model architecture and KV cache management policy to meet the target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0954", "title": "KV-Cache: Mastery — Long-Context KV Cache Strategy for Mobile Code Assistant", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a memory-efficient KV cache, eviction policy, and quality preservation strategy for a 32K context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0955", "title": "KV-Cache: Optimize KV Cache for Aggressive Memory 
Compression on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should KV cache compression and offloading free memory for longer contexts on a 3B A17 Pro LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 3}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0956", "title": "KV-Cache: Optimize KV Pruning for Token Budget on Snapdragon 8 Gen 3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For an 8192-token legal document on a 2048-token KV budget, should Snapdragon 8 Gen 3 use stride pruning or H2O?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0957", "title": "KV-Cache: Realize KV Cache Memory Layout for 3B LLM on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify tensor dimensions, memory addresses, data types, and total allocation for the KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0958", "title": "KV-Cache: Realize KV Cache Reuse for System Prompt on Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you cache the system prompt KV, and what is the quantified TTFT improvement for a 512-token user message?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0959", "title": "KV-Cache: Recall — What is a KV Cache and Why Does It Matter for Mobile LLM?", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a KV cache, why does mobile LLM decoding need it, and how does generation compute scale without it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0960", "title": "KV-Cache: Specification — KV Cache Requirements for Production Mobile LLM App", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV cache memory, quantization, eviction, and recovery requirements should a production A17 Pro chat app specify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0961", "title": "KV-Cache: Specification — KV Cache Budget for Multi-Turn Code Completion", "topic": "kv-cache-management", "competency_area": 
"memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the KV cache budget, quantization, layout, and context management for this code completion assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0962", "title": "KV-Cache: Specification — KV Cache Constraints for Latency SLA on Tensor G3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What maximum KV cache size, required quantization, and context limit are necessary to meet the latency SLA given bandwidth constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0963", "title": "Latency Decomposition: Compare On-Device vs. Cloud LLM Latency on A17 Pro", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 1B language model, which deployment approach wins TTFT and TPOT for 50 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0964", "title": "Latency Decomposition: Compare CoreML vs. TFLite Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency between Android NNAPI and iOS CoreML including framework overhead and memory transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0965", "title": "Latency Decomposition: Compute TTFT and TPOT for On-Device LLM on Snapdragon 8 Gen 3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute TTFT for a 256-token prompt and TPOT for decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0966", "title": "Latency Decomposition: Full Mobile ML App Latency Audit with User-Perceived Delay", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the tap-to-preview latency breakdown for the style transfer app, and which component should be optimized first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0967", "title": "Latency Decomposition: Optimize Wake-Word Detection Latency on Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you cut 
wake-word P99 latency from 180 ms to under 80 ms for a 1M-param INT8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0968", "title": "Latency Decomposition: Size Inference Latency Budget for AR App on A17 Pro", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency budget does each AR segmentation pipeline stage consume, and how much 60 FPS slack remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0969", "title": "MLOps Lifecycle: Compare App Store vs. Dynamic Model Updates for iOS ML Apps", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do App Store bundled models and dynamic post-install downloads compare regarding latency, compliance, and bandwidth cost for 10M users?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0970", "title": "MLOps Lifecycle: Compare Federated Learning vs. Centralized Training for Mobile", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you quantify the gradient upload bandwidth, privacy implications, and model accuracy tradeoffs between federated and centralized training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0971", "title": "MLOps Lifecycle: End-to-End Mobile ML Platform Design for Consumer App", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What storage, bandwidth, and engineering overhead come from 10 variants of a 30 MB photo model for 50M users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0972", "title": "MLOps Lifecycle: End-to-End Mobile MLOps with Personalization Pipeline", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the full lifecycle including on-device training, privacy-preserving aggregation, global model update, and OTA distribution to 5M devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0973", "title": "MLOps Lifecycle: Optimize Mobile App CI/CD Pipeline for ML Model Updates", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most effective way to optimize the non-training stages of the CI/CD pipeline to drastically reduce the total time?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0974", "title": "MLOps Lifecycle: Size OTA Update Bandwidth for 100M Mobile App Users", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What monthly OTA model update bandwidth and CDN cost result for 100M users, and how much can delta updates save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0975", "title": "MLOps Lifecycle: Size Model Registry Storage for Mobile Segment Variants", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much registry storage and monthly S3 cost are needed for the four mobile model tiers and retained versions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0976", "title": "MLOps Lifecycle: Specify A/B Test Framework for On-Device Model Comparison", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify an A/B testing framework for comparing these models, defining sample size, assignment, metrics, and decision criteria?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0977", "title": "Model Format Conversion: Compare TFLite vs. CoreML for Cross-Platform Mobile Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should TensorFlow MobileNetV3 deploy to iOS through TFLite-to-CoreML conversion or ONNX-to-CoreML?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 0}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0978", "title": "Model Format Conversion: Compare INT8 vs. 
INT4 CoreML Quantization on A17 Pro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare model size, inference latency, and accuracy for a 50-token generation task?", "chain_ids": ["mobile-chain-auto-001-12"], "chain_positions": {"mobile-chain-auto-001-12": 1}, "chain_tiers": {"mobile-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0979", "title": "Model Format Conversion: Implement ONNX→CoreML Conversion with Numerical Validation", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What steps, validation metrics, expected size, and latency should be used when converting MobileNetV3-Small from ONNX to CoreML?", "chain_ids": ["mobile-chain-auto-001-11"], "chain_positions": {"mobile-chain-auto-001-11": 1}, "chain_tiers": {"mobile-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0980", "title": "Model Format Conversion: Full Stack LLM Conversion for On-Device iOS Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you evaluate conversion time, model size, TTFT, TPOT, and maximum context length for a 1B model in 8GB RAM?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 4}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0981", "title": "Model Format Conversion: End-to-End LLM CoreML Stateful Deployment Mastery", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the full multi-platform model format strategy for size and latency?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 2}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0982", "title": "Model Format Conversion: Optimize TFLite Model Conversion for Snapdragon DSP", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an EfficientNet-Lite0 TFLite model converted through the SDK show 45 ms latency instead of the 8 ms target?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": {"mobile-chain-auto-001-03": 1}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0983", "title": "Model Format Conversion: 
Size ONNX vs. TFLite Model Storage for Mobile App", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How large are the bundled ONNX FP32 and TFLite INT8 model variants, and do they fit under the 200MB app limit?", "chain_ids": ["mobile-chain-auto-001-11"], "chain_positions": {"mobile-chain-auto-001-11": 0}, "chain_tiers": {"mobile-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0984", "title": "Model Format Conversion: Recall ONNX Opset Compatibility for Mobile Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the recommended ONNX opset version for converting PyTorch models to CoreML, and why does opset version matter?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 0}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0985", "title": "Model Format Conversion: Specify Multi-Platform Model Conversion CI/CD Requirements", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a production CI/CD pipeline for converting and validating PyTorch models for simultaneous iOS and Android deployment?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 1}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0986", "title": "Model Size Estimation: Analyze Why Mobile LLM Needs INT4 Despite 8GB RAM", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 3B parameter model require INT4 quantization to run on an 8GB iPhone 16 Pro?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0987", "title": "Model Size Estimation: Design On-Device ML Memory Architecture for Dual-Model iPhone App", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the app keep preview and high-quality portrait models in memory while meeting 60 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0988", "title": "Model Size Estimation: Design Memory-Efficient LLM Stack for Android Flagship", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a mobile device allocate precision, KV cache, and 
context length for a 3B LLM in 16 GB RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0989", "title": "Model Size Estimation: Compare 1B vs. 3B LLM Memory on A17 Pro", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the model size, KV-cache capacity at 4096 context, TPOT, and maximum number of conversation turns before context overflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0990", "title": "Model Size Estimation: Compare INT4 vs. INT8 Memory for Mobile LLM on Snapdragon", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare memory footprint, TPOT, maximum batch size, and accuracy implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0991", "title": "Model Size Estimation: Fluency — Size Mobile LLM Memory in 60 Seconds", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory does a 1B INT4 model use on a 16 GB phone, and what remains for KV cache?", "chain_ids": ["mobile-chain-auto-027-12"], "chain_positions": {"mobile-chain-auto-027-12": 0}, "chain_tiers": {"mobile-chain-auto-027-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0992", "title": "Model Size Estimation: Implement Parameter Count for Custom Transformer on Mobile", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many parameters and INT8 bytes does a 6-layer 512-hidden transformer with a 32K vocab require, including embeddings, attention, and FFN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0993", "title": "Model Size Estimation: Master Full On-Device LLM Memory Audit for Production iOS App", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you verify the system stays under the 5GB practical ceiling?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 3}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0994", "title": "Model Size Estimation: Master Tradeoffs for Multi-Modal LLM on Mobile", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": 
"published", "phase": "inference", "question": "How would you compute full memory layout and TPOT for text generation after image encoding?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0995", "title": "Model Size Estimation: Diagnose Mobile OOM from KV-Cache Growth", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of the OOM crashes and quantify a fix?", "chain_ids": ["mobile-chain-auto-027-12"], "chain_positions": {"mobile-chain-auto-027-12": 1}, "chain_tiers": {"mobile-chain-auto-027-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0996", "title": "Model Size Estimation: Diagnose Mobile Performance Degradation from Memory Pressure", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 7B INT4 LLM on Snapdragon 8 Gen 3 slow from 45ms to 180ms TPOT after 5 minutes, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0997", "title": "Model Size Estimation: Realize Full Memory Layout for 3B LLM on Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 3B INT4 LLM plus 4096-token KV cache fit on Snapdragon 8 Gen 3, and what TPOT follows from 77 GB/s?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 0}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0998", "title": "Model Size Estimation: Realize Memory Layout Comparison Across 3 Mobile Platforms", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do A17 Pro, Snapdragon 8 Gen 3, and Tensor G3 compare on memory budget, concurrent KV-cache users, and TPOT for a 1B INT4 LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0999", "title": "Model Size Estimation: Specify Maximum On-Device LLM for Target Memory Constraint", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive the maximum model family, precision, max parameter count, and expected accuracy to satisfy these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1000", "title": "Network Bandwidth Bottlenecks: Analyze LPDDR5X Shared Bus for Mobile SoC Memory Bottleneck", "topic": 
"network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much of a 77 GB/s Snapdragon memory bus does a 3B INT4 LLM consume alongside 4K H.265 encoding, and how does it degrade TPOT?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 0}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1001", "title": "Analyze Thermal Throttling Impact on Mobile Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What does a TPOT jump from 7.4ms to 14ms reveal about A17 Pro bandwidth throttling for a 500 MB LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1002", "title": "Network Bandwidth Bottlenecks: Design Memory Bandwidth Architecture for Mobile Multi-Model Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a scheduler prevent memory bandwidth saturation across concurrent speech, LLM, and depth tasks?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1003", "title": "Network Bandwidth Bottlenecks: Design Bandwidth-Efficient On-Device LLM Pipeline", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a bandwidth-efficient decode pipeline for a 3B INT4 LLM targeting TPOT < 30ms by leveraging weight tiling, prefetching, and KV-cache layouts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1004", "title": "Network Bandwidth Bottlenecks: Diagnose LPDDR5X Bandwidth Saturation on Snapdragon", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do LLM TPOT and camera inference both hit 60% throughput when memory bus use reaches saturation, and how do you fix it?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 1}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1005", "title": "Network Bandwidth Bottlenecks: Diagnose LLM TPOT Regression After App Update", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a 60 FPS Metal animation explain a 1B INT4 LLM TPOT jump from 7.4ms to 11ms on A17 Pro?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1006", "title": "Network Bandwidth Bottlenecks: Evaluate On-Device vs. Cloud for Bandwidth-Constrained Mobile Users", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do end-to-end latencies for on-device and cloud LLM inference compare across 5G and LTE network conditions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1007", "title": "Network Bandwidth Bottlenecks: Evaluate Speculative Decoding Bandwidth Impact on Mobile", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "On Snapdragon 8 Gen 3, does a 100M draft plus 3B INT4 target improve TPOT after bandwidth costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1008", "title": "Network Bandwidth Bottlenecks: Evaluate WiFi vs. Cellular Bandwidth for Mobile Cloud ML", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At what network RTT condition does the on-device model outperform the cloud model for a 128-token generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1009", "title": "Network Bandwidth Bottlenecks: Fluency — Mobile Bandwidth Math in 60 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the decode-bound TPOT and what's the memory bandwidth utilization during a single-user chat session?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1010", "title": "Network Bandwidth Bottlenecks: Implement TPOT Formula and Arithmetic Intensity Analysis", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute arithmetic intensity, peak FLOPS-bound TPOT, peak BW-bound TPOT, and determine which bottleneck applies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1011", "title": "Network Bandwidth Bottlenecks: Master Full Bandwidth Analysis for Mobile AI System", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What bandwidth conflicts arise when these workloads run together, and what scheduler changes are needed?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": 
{"mobile-chain-auto-022-09": 3}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1012", "title": "Network Bandwidth Bottlenecks: Master Roofline Analysis for Mobile LLM Scaling", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At what batch size does a 3B INT4 LLM transition from bandwidth-bound to compute-bound decode, and what does this mean for mobile serving?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 4}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1013", "title": "Network Bandwidth Bottlenecks: Optimize LLM TPOT via Weight Quantization on A17 Pro", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which quantization strategy gets a 3B FP16 LLM on A17 Pro from 88 ms TPOT to under 30 ms?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 3}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1014", "title": "Network Bandwidth Bottlenecks: Optimize Decode TPOT via Speculative Decoding on Snapdragon", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 200M draft model make a 7B INT4 LLM meet a sub-20ms TPOT target on Snapdragon 8 Gen 3, and how does it compare to reducing model size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1015", "title": "Network Bandwidth Bottlenecks: Optimize Mobile Camera Pipeline Memory Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What memory bandwidth components explain the 15GB/s camera pipeline load, and which optimizations reduce it below 8GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1016", "title": "Network Bandwidth Bottlenecks: Realize LPDDR5X Bandwidth Budget for Full Mobile AI Stack", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the LPDDR5X bandwidth consumption for a mobile AI stack, and does it fit under the 77GB/s Snapdragon limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1017", "title": 
"Realize Peak Bandwidth Utilization During On-Device LLM Response", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What peak and average LPDDR5X bandwidth does a 50-token 3B INT4 LLM response require on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1018", "title": "Network Bandwidth Bottlenecks: Specify Bandwidth SLA for Mobile LLM App Quality", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What bandwidth reservations, priorities, throttles, and fallbacks are needed to keep mobile LLM TPOT under 25ms P95?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1019", "title": "Network Bandwidth Bottlenecks: Specify QoS Framework for Multi-Model Mobile Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a complete QoS framework to manage 77 GB/s bandwidth across 5 simultaneous ML models?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 2}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1020", "title": "Network Bandwidth Bottlenecks: Recall LPDDR5X Bandwidth and Mobile Memory Hierarchy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the memory bandwidth of LPDDR5X on Snapdragon 8 Gen 3, how does it compare to LPDDR5 on Tensor G3, and why does this matter for LLM inference?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 0}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1021", "title": "Network Bandwidth Bottlenecks: Specification for Minimum Bandwidth to Support On-Device LLM", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive the memory bandwidth spec from first principles and explain how it constrains SoC design choices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1022", "title": "Model Format Conversion: Mobile Fluency — CoreML Model Conversion Pipeline in 60 Seconds", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", 
"question": "What are the 4 steps to convert a PyTorch MobileNet model to CoreML, and what is the expected size of a 5M parameter model in FP16?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 2}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1023", "title": "Model Format Conversion: Optimize INT8 to INT4 Conversion for Android LLM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose which quantization approach to use and quantify the accuracy-size tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1024", "title": "Model Format Conversion: Realize Multi-Format Model Storage for Cross-Platform iOS+Android", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the CDN delivery cost for 50M total installs of multi-format models, and how much does delta updating save?", "chain_ids": ["mobile-chain-auto-001-05"], "chain_positions": {"mobile-chain-auto-001-05": 2}, "chain_tiers": {"mobile-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1025", "title": "Network Bandwidth Bottlenecks: Design Offline-First Bandwidth Architecture for Rural Mobile LLM", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify: on-device model configuration, model download strategy, graceful degradation for different connectivity levels, and cloud fallback criteria?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1026", "title": "Network Bandwidth Bottlenecks: Diagnose Mobile LLM Latency Spike from Background App", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What would you identify as the root cause using bandwidth analysis and quantify the fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1027", "title": "Network Bandwidth Bottlenecks: Fluency — LPDDR5X BW and TPOT Math in 30 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the TPOT and what fraction of the memory bus does it saturate during decode?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 1}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1028", "title": "Network Bandwidth Bottlenecks: Fluency — Speculative Decode BW Arithmetic", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the effective TPOT assuming an 80% acceptance rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1029", "title": "Network Bandwidth Bottlenecks: Implement Memory-Bandwidth-Bound TPOT Derivation", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you derive the memory-bandwidth-bound TPOT formula, and what is the latency for a 3B INT4 model on a device with 77 GB/s bandwidth?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 2}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1030", "title": "Network Bandwidth Bottlenecks: Implement KV-Cache Bandwidth Impact at Long Contexts", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute TPOT at context lengths of 512, 2048, 4096, and 8192 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1031", "title": "Network Bandwidth Bottlenecks: Implement Flash-Attention BW Reduction", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the memory bandwidth savings from using Flash-Attention vs. 
standard attention for this 3B LLM prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1032", "title": "Network Bandwidth Bottlenecks: Master Memory Bus Analysis for Multi-LLM Mobile App", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you analyze whether both can run simultaneously without bandwidth saturation, and specify a token-interleaved schedule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1033", "title": "Mobile TCO Realization: Size Peak Bandwidth for Mobile LLM App", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For Snapdragon 8 Gen 3 at 77 GB/s, what are peak and sustained bandwidth for a 3B INT4 LLM over 10 turns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1034", "title": "Network Bandwidth Bottlenecks: Specification — Minimum BW for 10-Turn Conversation SLA", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive the worst-case bandwidth requirement and compare against LPDDR5 (51.2 GB/s) vs LPDDR5X (77 GB/s)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1035", "title": "Network Bandwidth Bottlenecks: Specification — Design Bandwidth QoS for 5G Hybrid Mobile Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What routing rules should choose between cloud and on-device LLM inference under 5G latency, privacy, and budget constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1036", "title": "Network Bandwidth Bottlenecks: Specification — Bandwidth Requirements for Voice Assistant Pipeline", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the peak LPDDR5X bandwidth requirements for each stage of the A17 Pro voice assistant pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1037", "title": "Mobile TCO Analyze: A17 Pro vs Cloud for On-Device Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the annual battery degradation cost, cloud cost, and economic break-even point for on-device inference?", "chain_ids": 
["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1038", "title": "Mobile TCO Design: 8GB vs 16GB NPU Inference Economics", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which mobile platform has the better two-year TCO and capability profile for a 10,000-device enterprise LLM deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1039", "title": "Mobile TCO Design: On-Device vs Hybrid Inference Cost for Consumer App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Calculate the monthly cloud cost, optimize the cutoff, and estimate annual savings from increasing the on-device fraction to 85%.", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1040", "title": "Mobile TCO Evaluation: A17 Pro vs Tensor G3 for On-Device ML App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate per-transcription energy cost for a 15-minute real-time audio transcription, and determine model compatibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1041", "title": "Mobile TCO Evaluation: On-Device vs Cloud for Photo Enhancement App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For SD v1.5 generating 5 images/day, which is cheaper over two years: on-device or a $0.01 cloud API, and where is the break-even point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1042", "title": "Mobile TCO Fluency: Quick Battery Cost Estimation for Mobile ML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What annual energy cost and battery-cycle impact do the mobile ML workloads create?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1043", "title": "Mobile TCO Implement: Calculate Cost Per Inference for Mobile App", "topic": 
"tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the inference time, energy per inference, annual energy cost per user, and cost if monetizing electricity as a service (at $0.10/kWh markup)?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1044", "title": "Mobile TCO Mastery: Enterprise Mobile AI Strategy Full Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How does the 3-year TCO of an on-device enterprise deployment compare to a cloud API at $0.001/1K tokens?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1045", "title": "Mobile TCO Mastery: On-Device AI ROI for Mobile App Monetization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the 2-year NPV and ROI for building on-device LLM capabilities?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1046", "title": "Mobile TCO Optimization: Reduce Battery Drain for Intensive Mobile ML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you quantify the TCO impact for 1M daily active users in electricity terms while reducing battery drain?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 4}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1047", "title": "Mobile TCO Realization: Concrete Cost Model for Music AI App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does the 2-year TCO of an on-device MusicGen deployment compare to cloud generation at $0.05 per song?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1048", "title": "Mobile TCO Realization: Memory Cost of On-Device Model Hosting", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can these three models fit in the A17 Pro's 8GB RAM budget, and what is their effective storage cost on a 256GB ($1199) device?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1049", "title": "Mobile TCO Recall: Mobile ML Hardware Cost Tiers", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What TOPS, RAM, power, and maximum INT4 and FP16 LLM sizes characterize modern mobile SoC platforms?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1050", "title": "Mobile TCO Specification: Design Mobile-First AI Product Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the on-device vs cloud split, annual revenue, cost breakdown, and gross margin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1051", "title": "Mobile Transformer Cost Evaluation: On-Device LLM Viability on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can Llama-3-8B INT4 meet a P99 under 3 seconds for 100-token responses on the A17 Pro, given RAM and bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1052", "title": "Mobile Transformer Cost Evaluation: Snapdragon vs A17 Pro LLM Performance", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate decode throughput (tokens/second), latency for 200-token response, and which platform better utilizes its memory advantage for LLM decode?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 2}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1053", "title": "Mobile Transformer Cost Fluency: Quick LLM Sizing for Mobile", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the quick estimates for maximum INT4 LLM size, 3B decode speed, and 1024-token prefill cost on an 8GB device with 100GB/s bandwidth?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 1}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1054", "title": "Mobile Transformer Cost Implement: Calculate Token Budget for Mobile 
LLM", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate model memory, KV cache per token, max context tokens given RAM constraints, and verify the throughput SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1055", "title": "Mobile Transformer Cost Mastery: Full On-Device LLM Product Design", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you choose model size and quantization for each task and calculate memory allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1056", "title": "Mobile Transformer Cost Mastery: Mobile LLM Scaling Law Analysis", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What would you identify as the Pareto-optimal model for a voice assistant (SLO: P99 < 3s, quality: perplexity < 12)?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 4}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1057", "title": "Mobile Transformer Cost Optimization: Quantization + Speculative for Mobile LLM", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimization achieves P99 < 3s with minimum quality loss?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 3}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1058", "title": "Mobile Transformer Cost Optimization: On-Device vs Streaming Hybrid", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an optimal hybrid policy that minimizes cost while maintaining P99<3s across network conditions, and what is the cost per 1000 daily queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1059", "title": "Mobile Transformer Cost Realization: Concrete Memory Layout for Mobile LLM", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is there room for a 7B INT4 target model alongside the draft model for speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1060", "title": "Mobile Transformer Cost Recall: Mobile LLM Memory Bandwidth Facts", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the memory bandwidth for these SoCs, and what is the decode tokens/second for a 3B INT4 model on each?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 0}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1061", "title": "Mobile Transformer Cost Specification: Design Multi-Task Mobile LLM System", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which model architecture and memory allocation allows keyword spotting, chat, and summarization to meet their SLOs simultaneously?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1062", "title": "Diagnose Thermal Throttling Cost Impact on A17 Pro Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the cost impact of throttling on user experience and propose thermal-aware scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1063", "title": "Analyze Transformer Token Budget on Snapdragon 8 Gen 3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory allocation for supporting a 4K context window with 10 concurrent conversation turns, given 32 layers, 8 KV heads, and a 64 head dimension?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1064", "title": "Design On-Device LLM Architecture for Tensor G3 Within Thermal Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What LLM architecture fits under the 3W power, 500ms TTFT, and 50ms/token decode constraints on the Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1065", "title": "Design Prompt Caching Strategy for Mobile LLM on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What memory cost and TTFT savings come from caching a 512-token system-prompt KV cache across requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-1066", "title": "Diagnose Attention Softmax Precision Loss on Mobile NPU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do INT8 attention logits on a mobile NPU degrade the LLM from 89% at 128 tokens to 61% at 512 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1067", "title": "Implement Mobile LLM Decode Benchmark on Tensor G3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a decode benchmark isolate inference time and compare measured tokens per second to the bandwidth limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1068", "title": "Realize Mobile LLM Realization: Llama 3.2 1B Deployment on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the correct KV-cache size for the Llama 3.2 1B deployment, and why is the larger estimate wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1070", "title": "Speculative Decoding Feasibility on Mobile NPU", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does speculative decoding impact on-device latency given the NPU cannot run two models concurrently and must swap them in shared memory?", "chain_ids": ["mobile-chain-auto-secondary-017-33"], "chain_positions": {"mobile-chain-auto-secondary-017-33": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1071", "title": "Speculative Decoding with On-Device LoRA Draft", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a shared-weight speculation system and analyze its memory savings compared to a separate 0.5B draft?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1072", "title": "On-Device KV-Cache Budget for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache context length fits a 3B INT4 LLM on A17 Pro when only 2GB remains for KV storage?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 3}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1073", "title": "KV-Cache Persistence Across App Sessions on Mobile", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a KV-cache persistence strategy that avoids re-prefilling tokens when the app is relaunched?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1074", "title": "KV-Cache Quantization for On-Device LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare FP16 vs INT8 vs INT4 KV-cache strategies for maximum concurrent conversations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1075", "title": "KV-Cache Impact on Mobile Thermal Throttling", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much of a 3B LLM slowdown from 15 tok/s at token 200 to 6 tok/s at token 800 is KV growth versus heat?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1076", "title": "KV-Cache Layout for NPU Acceleration", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the optimal memory layout for the ANE, and why does innermost dimension alignment matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1077", "title": "Shadow Deployment Frame Drops on Exynos NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does shadow deployment on the device drop camera frame rate despite low TOPS utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1078", "title": "Tensor G3 CPU vs TPU Decoding Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the highly efficient TPU not provide a proportional speedup for decoding, and what is its theoretical compute limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1079", "title": "Adversarial Sparsity Loss on Exynos 2400 NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do adversarial camera inputs cause cross-component interference on Exynos 2400 
despite fixed model FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1080", "title": "Analyzing GQA vs MHA Memory Bandwidth on Hexagon NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this architectural change provide such a significant speedup on this specific hardware?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1081", "title": "Depthwise Convolutions on Tensor G3 TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why the latency reduction does not scale linearly with the FLOP reduction on this specific hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1082", "title": "Mobile NPU Inference Cost Analysis on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Snapdragon 8 Gen 3 theoretical 45 TOPS metric not perfectly translate to the observed frame rate?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1083", "title": "On-Device Coreset Selection and NPU Memory Contention", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does increasing an Exynos 2400 coreset from 500 to 2000 samples drop NPU utilization below 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1084", "title": "A17 Pro Unified Memory Contention in Video Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does this data pipeline bottleneck the system despite the available compute overhead?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 1}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1085", "title": "On-Device Data Quality Bandwidth Contention", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "training", "question": "Why does enabling a Tensor G3 anomaly-detection gate cut Gemini Nano token generation despite only 10% TPU use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1086", "title": "Dataset Padding Impact on NPU Utilization", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does feeding a 3840x3840 padded image degrade the NPU's effective TOPS and shift it to a memory-bound state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1087", "title": "DP-SGD Memory Wall on A17 Pro", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does increasing the DP-SGD batch size from 32 to 256 cause memory failure and thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1088", "title": "On-Device PSI Drift Compute Bottleneck", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why this specific statistical detection workflow degrades system reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1089", "title": "Analyze Encoder vs Decoder Latency on Snapdragon NPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1B encoder run faster per token than a 1B decoder on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1090", "title": "Energy Analysis of Memory Access in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does battery drain come primarily from memory weight reads during Gemini Nano token generation instead of TPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1091", "title": "Analyzing 3-bit AWQ Overhead on Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 3-bit AWQ quantization cause a severe compute bottleneck on the mobile TPU architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1092", "title": "NPU Frame Dropping Impact on Equalized Odds", "topic": 
"fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this hardware-driven throttling degrade the Equalized Odds fairness metric specifically for Subgroup A, and what is the True Positive Rate calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1093", "title": "Federated Training NPU Underutilization", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is this massive NPU compute resource severely underutilized during the local training phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1094", "title": "Hexagon NPU Thermal Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the system switch to a less efficient CPU fallback instead of running the primary model at 15 FPS on the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1095", "title": "TPU Graph Operator Fusion Performance Anomaly", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can LayerNorm+MatMul fusion run slower than unfused ops despite reducing LPDDR memory traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1096", "title": "Distilled vs Pruned Memory Bandwidth", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does Model B cause stuttering and what is the difference in memory bandwidth consumption?", "chain_ids": ["mobile-chain-auto-secondary-014-21"], "chain_positions": {"mobile-chain-auto-secondary-014-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1097", "title": "Exynos 2400 Shared Memory Exhaustion", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this specific performance cliff happen at this exact context length based on the hardware specs?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1098", "title": "Analyzing TTFT vs TPOT Bottlenecks", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the end-to-end latency of an on-device LLM become heavily bottlenecked during the decode phase (TPOT) rather than the prefill phase (TTFT)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1099", "title": "Hexagon NPU Memory Bandwidth Bottleneck Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does batch-size-1 INT8 sequence inference on the Hexagon NPU achieve only about 1.5 TOPS despite a 45 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1100", "title": "NPU Cold Start Latency with Memory-Mapped Weights", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming a 4KB page size and 4GB/s storage read speed, why does the NPU experience this first-run latency spike and what is the overhead?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1101", "title": "OOM Crash on A17 Pro During High-Res Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the OS kill the process during high-res inference, and how does the unified memory architecture contribute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1102", "title": "Mixed-Precision Bandwidth Contention on Exynos 2400", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a 50/50 FP16-FP32 model suffer extra latency under Exynos ISP load, and how much extra bandwidth does a 10M parameter layer add?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1103", "title": "CI/CD to On-Device Performance Gap", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is a 3B INT8 model compute-bound in batched CI but only 1.6% utilized at batch size 1 on Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1104", "title": "CoreML Fallback Penalty on Apple A17 Pro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a 10% CPU fallback disproportionately degrade latency despite unified memory, and what is the new system throughput?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": 
{"mobile-chain-auto-001-02": 0}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1105", "title": "Analyzing OOM for 3B FP16 Model on A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 6 GB FP16 3B model OOM on A17 Pro even though the phone has 8 GB unified memory?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 0}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1106", "title": "Investigating Tail Latency Spikes Under Memory Pressure", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the TPU utilization drop during memory pressure events, and what is the expected tail latency floor assuming a flash storage read speed of 2.5 GB/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1107", "title": "Dual-Core NPU Scheduling Bottleneck Analysis", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why this scheduling choice degrades performance on this specific architecture?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1108", "title": "Thermal Throttling on A17 Pro Neural Engine", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this sustained workload exceed the power budget and trigger thermal throttling on the shared unified memory architecture?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1109", "title": "A17 Pro Neural Engine Memory Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the Neural Engine underutilizing its 35 TOPS compute capacity, and what is the system bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1110", "title": "Unstructured Sparsity Inefficiency on Apple A17 Pro", "topic": "pruning-sparsity", "competency_area": "optimization", 
"track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does unstructured sparsity fail to yield hardware performance gains, and what is the expected compute-bound execution time?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1111", "title": "INT8 Quantization Memory Bandwidth Analysis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ double throughput from 15 FPS to 30 FPS while compute utilization stays at 25%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1112", "title": "Jank Analysis of On-Device LLM on Tensor G3", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do frame drops and ANR warnings occur when Gemini Nano runs on Tensor G3's TPU during 60 FPS scrolling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1113", "title": "Guardrail Latency on Tensor G3 TPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 guardrail miss a 100ms SLA while Gemini Nano generates a 500-token response?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1114", "title": "Diagnosing A17 Pro Neural Engine Utilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this image enhancement model reach only 10% of the A17 Pro Neural Engine's peak utilization at batch size 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1115", "title": "A17 Pro Unified Memory Contention Analysis", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this non-deterministic execution occur and what is the constrained memory bandwidth under contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1116", "title": "Continuous Video Feature Ingestion Power Analysis", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", 
"level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much power does continuous 60 FPS feature ingestion consume, and why does a concurrent game exceed the thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1117", "title": "Thermal Throttling in Shared NPU/ISP Pipelines", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does peak Exynos 2400 NPU use throttle in a camera pipeline, and what sustained TOPS fit the remaining thermal headroom?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1118", "title": "KV-Cache OOM on A17 Pro Unified Memory", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 3B 4-bit transformer with 4000-token context OOM under a 2.5 GB background-app limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1119", "title": "Shadow Deployment Architecture for On-Device LLM", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro shadow deployment run a new translation model safely without disrupting the active model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1120", "title": "On-Device LLM Guardrails against Prompt Injection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a defense-in-depth strategy that robustly filters adversarial inputs without violating memory constraints or battery limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1121", "title": "Design On-Device Attention for Gemini Nano on Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an attention architecture that optimizes for the G3's memory constraints and TPU capabilities while maintaining acceptable summarization quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1122", "title": "Architecting a Real-Time Video Segmentation Model for Exynos NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", 
"bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs of using larger feature maps early in the network versus aggressive downsampling given the shared memory bandwidth constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1123", "title": "Architecting On-Device LLM Workloads for Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect an on-device intelligence pipeline for a messaging app running exclusively on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1124", "title": "On-Device Coreset Selection Architecture", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the tradeoffs between heuristic filtering and model-based data selection to maximize the Information-Compute Ratio (ICR) without causing memory thrashing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1125", "title": "Real-time Video Quality Gating on Exynos 2400", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where should the quality validation run, and how does early frame rejection balance power against accuracy?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1126", "title": "On-Device Active Learning Data Curation for Gemini Nano", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the data selection criteria and annotation workflow to filter high-value samples while minimizing battery impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1127", "title": "On-Device Drift Detection for Exynos 2400", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 camera pipeline detect drift on device without exporting raw frames or starving ISP/NPU bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1128", "title": "On-Device Translation 
Architecture on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is a specialized encoder-decoder architecture preferable to a unified decoder-only LLM for Tensor G3 live translation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1129", "title": "Always-On Vision Architecture for Exynos 2400", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an always-on continuous object tracking architecture for the Samsung Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1130", "title": "Architecting a Sub-4-bit LLM Pipeline for Samsung Exynos 2400", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the design choices between techniques like GPTQ and AWQ, considering the NPU's hardware execution profile?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1131", "title": "On-Device Intersectional Fairness Telemetry Pipeline", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the evaluation pipeline, detailing how you schedule the evaluation models, manage memory contention, and aggregate subgroup metrics securely?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1132", "title": "On-Device LLM Degradation Ladder for Tensor G3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ensure the translation service remains fail-operational without degrading the overall OS experience under severe memory or thermal pressure?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1133", "title": "Designing a Distillation Pipeline for A17 Pro NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a distilled LLM student be trained and sized to fit mobile memory, latency, and power constraints?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1134", "title": "Paged KV-Cache Architecture on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile KV-cache manager support 8K contexts without triggering OOM or latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1135", "title": "Architecting Memory-Mapped Inference for Gemini Nano on Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped weight loading strategy that guarantees zero cold-start delay for these apps while respecting the shared memory pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1136", "title": "Architecting Memory Management for On-Device LLMs", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a memory management strategy to handle the KV cache and weights without triggering OS-level OOM eviction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1137", "title": "Mixed-Precision Inference Architecture on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 60 FPS A17 Pro segmentation model partition INT8 and FP16 work to balance speed, bandwidth, and accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1138", "title": "Architecting On-Device CI/CD for Exynos 2400 NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a CI/CD pipeline validate quantized on-device latency, accuracy, and shared-memory limits?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1139", "title": "Designing Graph Delegation for Snapdragon 8 Gen 3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a conversion and delegation strategy that safely evaluates tradeoffs between INT8 quantization, operator fallback penalties, and heterogeneous compute allocation?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": 
{"mobile-chain-auto-001-03": 2}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1140", "title": "On-Device LLM Memory Architecture for Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the memory allocation, quantization strategy, and execution pipeline to safely deploy this model while maximizing the 45 TOPS INT8 compute available on the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1141", "title": "Exynos 2400 NPU Telemetry and Straggler Detection System", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should telemetry detect thermal throttling and memory stragglers with low reporting overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1142", "title": "Hardware-Aware NAS Design for A17 Pro Neural Engine", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should hardware-aware NAS target A17 Pro video super-resolution while respecting 60 FPS and memory-bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1143", "title": "Cross-Core Operator Scheduling for A17 Pro", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro operator scheduler maximize Neural Engine utilization without unified-memory thrashing or throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1144", "title": "Architecting Heterogeneous Profiling for NPU Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Snapdragon 8 Gen 3 profiling system distinguish NPU compute limits, sync overhead, and memory contention?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1145", "title": "Architecting a Pruning Strategy for Hexagon NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the end-to-end 
pruning strategy, detailing your choice of sparsity patterns and how you would align them with the Hexagon architecture to guarantee the speedup?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1146", "title": "Architecting On-Device Guardrails for Generative Text", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the system concurrently run a 3B generator and safety classifier without breaking latency or safety?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1147", "title": "Architecting a Fail-Safe ADAS Monitor on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Hexagon NPU dashcam pipeline detect NPU hangs or memory faults and enter a safe state within a 100ms fault-tolerant time interval?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1148", "title": "Real-Time Sensor Fusion Architecture on Snapdragon 8 Gen 3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an always-on Snapdragon 8 Gen 3 sensor-fusion pipeline ingest data and run inference without waking the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1149", "title": "Architecting Thermal-Aware Sustained Video Processing on A17 Pro", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile video super-resolution pipeline sustain 30 FPS under a strict continuous thermal envelope?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1150", "title": "LLM Memory Allocation on Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a memory budgeting 
strategy for an on-device conversational AI on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1151", "title": "Shadow Deployment LMK Evictions on Tensor G3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given that the Tensor G3 features 12 GB of LPDDR5X RAM and a 7.5 TOPS TPU, what hardware-level interaction is causing this shadow deployment to fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1152", "title": "Diagnosing High Latency in On-Device Gemini Nano Sanitization", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the interaction between the models and the hardware to identify the root cause of the latency?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1153", "title": "Diagnosing Low NPU Utilization in MobileNet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the severe NPU underutilization and high latency in MobileNetV2 depthwise convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1154", "title": "On-Device Fine-Tuning Battery Drain and Model Collapse", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this system failure and propose a data-centric solution?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1155", "title": "NPU Fallback from Data Contract Violations", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do FP32 preprocessing violations push an INT8 model from 2 ms to over 60 ms?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1156", "title": "A17 Pro NPU Dataset Bias in Low-Light Portrait Segmentation", "topic": 
"dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this discrepancy between validation metrics and real-world performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1157", "title": "Diagnosing Drift Detection Latency Spikes on Hexagon NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the PSI calculation is causing system-wide jank on this heterogeneous architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1158", "title": "Low Compute Utilization During On-Device Decoding", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the A17 Pro's 35 TOPS capability and 8 GB of shared unified memory, what architectural characteristic of the encoder-decoder model causes this specific hardware symptom?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1159", "title": "Diagnosing LLM Latency Bias on Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this disparity, and how do the hardware specifications explain the symptom?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1160", "title": "Neural Engine OOM During Model Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does graceful degradation from a heavy to lightweight A17 Pro model trigger OOM, and how should it be fixed?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1161", "title": "Diagnosing High Latency in Feature-Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What architectural trait inherited from feature distillation causes memory 
bandwidth saturation on a mobile NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1162", "title": "Diagnosing OOM during long-context Gemini Nano inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely cause of this crash, and how would you diagnose the memory pressure during this long-context generation?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1163", "title": "Camera App Transition OOM on Exynos 2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does switching between the camera app and an Exynos 2400 segmentation app cause OOM despite the device having 12 GB of shared LPDDR5X memory?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1164", "title": "Diagnosing NaN Outputs in On-Device LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a 1.8B INT8-FP16 LLM on Tensor G3 produce NaNs on prompts with large attention scores?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1165", "title": "CI/CD Deployment Fallback Regression", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What training-serving gap caused the Hexagon NPU canary latency regression from 5ms to over 80ms?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": {"mobile-chain-auto-secondary-006-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1166", "title": "Diagnosing Delegation Fallback on Exynos 2400", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is an INT8 TFLite 
vision model 5x slower with low NPU utilization and saturated LPDDR5X?", "chain_ids": ["mobile-chain-auto-001-04"], "chain_positions": {"mobile-chain-auto-001-04": 0}, "chain_tiers": {"mobile-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1167", "title": "Diagnosing OOM on Exynos 2400 Shared Memory", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 3B INT8 conversational model OOM on Exynos 2400 during camera viewfinder despite 12 GB shared memory?", "chain_ids": ["mobile-chain-auto-027-11"], "chain_positions": {"mobile-chain-auto-027-11": 0}, "chain_tiers": {"mobile-chain-auto-027-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1168", "title": "Diagnosing Shared Memory Contention on Hexagon NPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do NPU p99 latency stragglers align with heavy GPU UI animations, and how should they be diagnosed?", "chain_ids": ["mobile-chain-auto-secondary-006-23"], "chain_positions": {"mobile-chain-auto-secondary-006-23": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1169", "title": "Diagnosing TPU Utilization Drop in Gemini Nano Decoding", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What operator scheduling or graph execution issue is most likely causing this symptom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1170", "title": "Diagnosing Latency Spikes on Exynos 2400", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely bottleneck causing these intermittent latency spikes, and how would you verify it using profiling tools?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1171", "title": "Guardrail CPU Fallback on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does invoking a 100M FP32 toxicity guardrail max out the CPU and stall a Snapdragon 8 Gen 3 LLM app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-1172", "title": "Watchdog Resets from NPU Memory Contention", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware architecture bottleneck causes these sudden latency spikes and watchdog resets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1173", "title": "Diagnosing TPU Thermal Throttling on Tensor G3", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does on-device LLM summarization slow down 3x after 5 minutes of continuous inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1174", "title": "CNN Architecture Choice for Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architectural design patterns are most optimal for maximizing utilization on this specific NPU, and what tradeoffs must you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1175", "title": "Evaluating On-Device Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate these alternatives, considering the shared memory architecture, and recommend the better design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1176", "title": "Shadow Deployment vs Canary for Snapdragon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which strategy is better suited for this hardware, shadow deployment or a progressive canary rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1177", "title": "Real-Time Video Ingestion Compute Utilization", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What compute utilization does a 60 FPS video ingestion pipeline require for 50 GOPs per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1178", "title": "Adversarial Randomized Smoothing Latency on A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What robust FPS and energy per prediction result from 10-pass randomized smoothing on A17 
Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1179", "title": "On-Device Vision Pipeline Memory Bandwidth Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the total continuous memory bandwidth (in MB/s) consumed by this exact data pipeline sequence to keep the NPU fed.", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 1}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1180", "title": "Calculate Energy Cost of Memory vs Compute on Hexagon NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does the Hexagon NPU layer spend on INT8 compute versus LPDDR5X memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1181", "title": "Dynamic KV-Cache Paging for A17 Pro Unified Memory", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an iOS LLM engine manage KV-cache pages on the Apple A17 Pro's unified memory to avoid OOM while preserving Neural Engine utilization?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], "chain_positions": {"mobile-chain-bucket-kvcachem-05": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1182", "title": "Optimizing Intersectional Fairness Evaluation on Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the bottleneck and optimize this pipeline using the G3s hardware to quantify demographic parity and equalized odds efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1183", "title": "A17 Pro Mixed-Precision Bottleneck", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you identify the core bottleneck, and what speedup is expected by transitioning to an INT8 weight-only format?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1184", "title": "A17 Pro Neural Engine Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", 
"track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the A17 Pro voice assistant latency be decomposed for a 50-token prompt and 100-token response?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 4}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1185", "title": "Designing KV-Cache for Exynos 2400 NPU Shared Memory", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Exynos 2400 manage a 4096-token KV cache for a 7B LLM without unified-memory OOM kills?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1186", "title": "Exynos 2400 NPU Real-Time Power Budgeting", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify a power management strategy and deployment architecture that ensures sustained 30 FPS performance without exceeding the 2.5W cap?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1187", "title": "On-Device Guardrail Pipeline for Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a mobile chat app perform low-latency on-device PII and toxicity guardrails?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1188", "title": "Analyzing Shadow Deployment OOM on Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does shadowing a 4GB language model beside a 2GB vision model OOM on Exynos 2400 during camera use?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1189", "title": "MHA vs GQA Decoding Bottlenecks", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NPU sit idle despite having 45 TOPS available, and why does 
Grouped-Query Attention (GQA) significantly increase the token generation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1190", "title": "INT8 Calibration Dataset Bias", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does calibrating INT8 detection on the first 1000 video frames cause a 40% nighttime accuracy drop?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 0}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1191", "title": "Encoder-Decoder Memory Bandwidth Advantage", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1B/1B encoder-decoder translation model decode faster than a 2B decoder-only model on Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1192", "title": "Energy Cost of Memory vs Compute on Tensor G3", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a larger INT8 model that fits in SRAM use less energy than a smaller FP16 model fetching DRAM?", "chain_ids": ["mobile-chain-auto-secondary-013-13"], "chain_positions": {"mobile-chain-auto-secondary-013-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1193", "title": "3-bit vs 4-bit Unpacking Overhead on Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Explain why the smaller 3-bit model exhibits worse latency than the larger 4-bit model.", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1194", "title": "Quantization Bias Under Memory Contention", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does falling back to static quantization disproportionately impact equalized odds for the minority demographic?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1195", "title": "FedAvg Memory Bottleneck on Hexagon NPU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a FedAvg local epoch take 2.5 seconds instead of under 50ms despite high TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1196", "title": "Analyzing Hexagon NPU Fallback Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why should a throttled video pipeline fall back to a CPU model instead of a smaller NPU model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1197", "title": "TPU Kernel Fusion Limits", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the hardware constraints driving this compiler behavior and calculate the theoretical compute time for the linear layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1198", "title": "Distilled Model Memory Contention", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does latency spike during a live feed, and what is the effective memory bandwidth for loading the 25MB model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1199", "title": "Exynos Shared Memory KV-Cache Eviction", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Exynos 2400 transcription app hit OOM near 4,000 tokens only when the camera is active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1200", "title": "Gemini Nano TTFT Decomposition on Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What components make up the 600ms TTFT on the NPU, and which hardware limit dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1201", "title": "Analyzing Mmap Latency Spikes on Snapdragon 8 Gen 3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 500ms latency spike occur during the initial generation phase 
for a memory-mapped 2GB INT8 LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1202", "title": "Analyzing iOS Jetsam Eviction on Unified Memory", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this memory pressure eviction occur despite the theoretical availability within the 8 GB limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1203", "title": "FP16 vs INT8 Memory Bandwidth Contention on Exynos NPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the FP16 model suffer from high latency and low utilization despite the NPU having ample compute headroom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1204", "title": "CI/CD to Tensor G3 Deployment Discrepancy", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did a Tensor G3 deployment pass emulated CI/CD tests but crash with OOM and latency spikes in production?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1205", "title": "CoreML ANE Fallback Latency Analysis", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this specific operator gap cause a massive latency degradation despite the fast unified memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1206", "title": "A17 Pro OOM Analysis with KV Cache Scaling", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Analyze why the dynamic inference memory footprint exceeds the limit during this specific generation task.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1207", "title": "Tensor G3 LLM Straggler Latency Analysis", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why this performance degradation occurs and explain the system behavior using the provided hardware specifications?", "chain_ids": ["mobile-chain-auto-secondary-006-22"], "chain_positions": {"mobile-chain-auto-secondary-006-22": 0}, 
"chain_tiers": {"mobile-chain-auto-secondary-006-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1208", "title": "NAS Preference for Compute-Bound Layers on Exynos", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the NAS optimizer heavily penalize lower-FLOP depthwise architectures on this specific SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1209", "title": "Dual-Core NPU Operator Scheduling on Exynos 2400", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does parallel operator scheduling on a multi-core mobile NPU counter-intuitively increase latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1210", "title": "Always-On Inference Power Discrepancy", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an always-on model needing only 0.35 TOPS add 1.2W of system power?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1211", "title": "Analyzing ANE Memory Bottlenecks in Transformers", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 500M INT8 language model take 10 ms/token despite a 35 TOPS Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1212", "title": "Unstructured Pruning Performance Regression on A17 Pro", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does 75% unstructured sparsity cause latency regression and high power draw on a mobile Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1213", "title": "Per-Channel INT8 Latency on Exynos 2400", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why changing the quantization granularity causes this latency regression on this specific hardware architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1214", "title": "Jank 
Analysis in Tensor G3 Camera Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 33.3ms frame budget being missed, considering the shared memory architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1215", "title": "Guardrail Latency on Tensor G3", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 2.5B INT8 Tensor G3 guardrail model have a 50 ms latency floor before compute starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1216", "title": "A17 Pro Neural Engine Roofline Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the vision transformer reach only 7 TOPS, or 20% of peak, under roofline analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1217", "title": "Watchdog Resets on A17 Pro Neural Engine", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an A17 Pro driver-monitoring model miss a 15 ms watchdog heartbeat when CPU and GPU navigation load is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1218", "title": "A17 Pro Shared Memory Streaming Bottleneck", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does CPU-copying 24 MB 60Hz frames make a video pipeline miss its 16.6 ms budget despite ample NPU compute?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1219", "title": "Analyzing Sustained vs Burst Performance on Exynos 2400", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this performance degradation occur, and what is the theoretical sustained TOPS required to maintain the degraded framerate?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1220", "title": "Analyzing KV-Cache Constraints on A17 Pro", "topic": 
"transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the shared 8GB unified memory architecture become the primary bottleneck as sequence length increases?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1221", "title": "A17 Pro Unified Memory KV-Cache OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4096-token prompt OOM a 7B INT4 model even though the weights fit in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1222", "title": "Progressive Rollout Design for A17 Pro Neural Engine", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro progressive rollout test a thermal-heavy video model without exhausting unified memory?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1223", "title": "Architecting On-Device LLM Defenses on Apple A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device email summarizer defend against prompt injection using guardrails within memory and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1224", "title": "Architecting Long-Context Attention on Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Tensor G3 long-context summarization combine GQA, attention sinks, and sliding windows to fit 8K contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1225", "title": "Architecting an Efficient CNN for Exynos 2400 NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 object-detection CNN backbone reduce memory bandwidth while using the NPU effectively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1226", "title": "Architecting On-Device LLM Cost Strategy for Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": 
"published", "phase": "inference", "question": "How should a mobile deployment architect a 1.8B summarization model with INT4/INT8 tradeoffs and thermal fallback?", "chain_ids": ["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1227", "title": "On-Device Coreset Selection Architecture for Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a system that evaluates the Information-Compute Ratio (ICR) of incoming data and selects a high-value coreset for on-device training without starving OS resources?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1228", "title": "Zero-Copy Data Pipeline for Snapdragon NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 4K60 vision pipeline use zero-copy buffers to feed the NPU efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1229", "title": "Edge Data Quality Gate Design for Exynos NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should the system validate ISP frames for blur and lighting before waking a heavy authentication model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1230", "title": "On-device Data Curation for Active Learning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should Gemini Nano on Tensor G3 curate low-confidence smart replies locally without exposing private text?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 2}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1231", "title": "On-Device Drift Detection Architecture for Shared Memory Systems", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 perform on-device drift detection without caching raw frames or starving camera bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1232", "title": 
"Architecting Real-Time Translation on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an offline, real-time speech translation system for a device powered by the Google Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1233", "title": "Architecting an Energy-Efficient Real-time Translation Pipeline", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 translation pipeline (34.7 TOPS NPU, 12 GB shared RAM) minimize energy by balancing NPU compute and LPDDR5X access?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1234", "title": "Architecting Sub-4-bit LLM Deployment on Mobile NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 7B translation LLM use extreme quantization while fitting memory and bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1235", "title": "On-Device Fairness Evaluation Architecture", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro photography feature compute local fairness metrics under strict 100MB memory and 10ms latency constraints?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1236", "title": "Architecting Federated PEFT for LLMs on Tensor G3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a cross-device federated learning system to personalize an on-device LLM (like Gemini Nano) for predictive text formatting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1237", "title": "Tensor G3 LLM Degradation Ladder", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": 
"inference", "question": "How would you design a graceful degradation architecture that ensures users always receive smart reply suggestions within a strict 300ms latency budget, regardless of system stress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1238", "title": "Exynos 2400 Graph Partitioning", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 AOT compiler partition graphs and plan memory while the ISP handles 4K video?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1239", "title": "A17 Pro NPU Distillation Pipeline Design", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 7B teacher be distilled into an A17 Pro student optimized for INT8 real-time inference?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1240", "title": "A17 Pro Unified Memory KV-Cache Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a memory-safe KV-cache allocation and eviction policy that allows processing up to 4,000 tokens of context without exceeding the remaining 500 MB budget?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 4}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1241", "title": "Zero-Copy Memory Mapping for Gemini Nano on Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should edge system services share model weights to avoid exceeding RAM limits and TPU cold-start stalls?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1242", "title": "Architecting OOM-Resilient On-Device LLM Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an LLM assistant manage a growing KV cache under a dynamic 4 GB AI memory budget without triggering an OOM kill?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1243", "title": "Architecting Mixed-Precision Inference on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What mixed-precision strategy fits an A17 Pro segmentation model within 500MB while preserving accuracy and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1244", "title": "On-Device CI/CD for Video Super-Resolution", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the end-to-end MLOps lifecycle to automate the compilation, hardware-in-the-loop profiling, and deployment of these models targeting an NPU with shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1245", "title": "On-Device Multimodal Model Conversion for Hexagon NPU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a deployment handle unsupported custom attention operators without CPU fallback bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1246", "title": "Architecting a Vision-Language Model on Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 3B Snapdragon vision-language model fit a 4GB memory budget and handle bandwidth-bound decoding?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 2}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1247", "title": "On-Device Telemetry Design for Exynos 2400 Shared Memory", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 telemetry detect memory-contention stragglers caused by ISP load without adding shared-memory overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1248", "title": "Heterogeneous Operator Scheduling for Neural Engine", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a scheduler partition video operators across compute units without exceeding shared memory bandwidth and thermal limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1249", "title": "Architecting Latency Profiling on Snapdragon 8 Gen 3 NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Snapdragon 8 Gen 3 profiler identify compute versus memory bottlenecks with under 2% bandwidth overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1250", "title": "Designing Structured Sparsity for NPU Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity architecture to fit this model within memory limits while maximizing NPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1251", "title": "On-Device Guardrails for GenAI on Exynos 2400", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a continuous monitoring system that evaluates the tradeoffs between sequential and concurrent guardrail execution to guarantee safety without degrading the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1252", "title": "Architecting an AR Segmentation Pipeline on Snapdragon 8 Gen 3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 4K super-resolution pipeline use roofline analysis to guarantee it stays compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1253", "title": "Architecting ASIL-B Driver Monitoring on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the system to ensure deterministic execution, detailing memory isolation, watchdogs, and fallback if the NPU hangs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1254", "title": "Architecting Real-Time Audio Ingestion for Hexagon NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the data ingestion, buffering, and processing pipeline, justifying your decisions on where each stage of the pipeline should execute to balance latency and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1255", "title": "Architecting a 
Thermal-Aware Real-Time Video Processing Pipeline", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Propose a system-level design to maintain acceptable real-time performance indefinitely under sustained thermal constraints without severely degrading the user experience.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1256", "title": "Diagnosing OOM in On-Device LLM Shadow Rollout", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 shadow rollout of an LLM cause OOM and UI stutter, and what rollout architecture avoids it?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1257", "title": "Diagnosing Thermal Throttling from Adversarial Inputs on Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do certain Gemini Nano inputs trigger rapid thermal throttling and throughput collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1258", "title": "A17 Pro Neural Engine Latency Spike Diagnosis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can Neural Engine latency spike above 40ms for a 150 GOPS INT8 segmentation model expected at 4.5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1259", "title": "Diagnosing Model Collapse on Tensor G3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a personalized writing assistant become repetitive after federated updates with synthetic data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1260", "title": "On-Device Active Learning OOM Crashes", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a 50MB CoreML selection model still trigger OOM crashes during camera-frame curation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1261", "title": "Diagnosing On-Device Drift Detection OOM on Hexagon NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": 
"mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does background PSI drift detection cause OOM on Snapdragon 8 Gen 3 despite stable INT8 inference?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1262", "title": "Diagnosing Thermal Throttling from Memory Bound NPU Workloads", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Snapdragon 8 Gen 3 ViT drain battery and throttle after 10 minutes despite meeting FPS and TOPS targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1263", "title": "Diagnosing Latency Bias in On-Device LLMs", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Gemini Nano transcription on Tensor G3 show higher latency and battery drain for AAVE than SAE transcripts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1264", "title": "Diagnosing A17 Pro Neural Engine OOM Under Thermal Throttling", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an A17 Pro video app encounter sudden OOM crashes during thermal degradation after a period of stable memory use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1265", "title": "Diagnosing Slow Distilled LLM Generation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is a 1B distilled LLM on Tensor G3 barely faster than a 3B pruned model during autoregressive generation?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1266", "title": "Diagnosing Page Thrashing on Apple A17 Pro", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an mmap-backed 3GB A17 Pro language model spike to over 2.5 seconds after switching from the camera app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1267", "title": "Diagnosing OOM Evictions on Shared Memory Exynos 
2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of these sudden evictions considering the system's shared memory architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1268", "title": "Diagnosing FP8 Activation Overflow on Tensor G3 TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing this catastrophic degradation, and how would you resolve it while maintaining target latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1269", "title": "CI/CD Pipeline Graph Break Diagnosis", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline validation flaw led to this discrepancy, and what is the physical root cause on the device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1270", "title": "NPU Memory Bandwidth Contention", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Exynos 2400 NPU P99 latency spike 5x when the ISP processes high-resolution video?", "chain_ids": ["mobile-chain-auto-001-09"], "chain_positions": {"mobile-chain-auto-001-09": 0}, "chain_tiers": {"mobile-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1271", "title": "Diagnosing OOM on Shared Memory NPU Architectures", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 8B INT8 Exynos 2400 LLM crash when a camera-based multimodal feature starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1272", "title": "Diagnosing Latency Spikes on Snapdragon 8 Gen 3", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause of these symptoms using your knowledge of this heterogeneous architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1273", "title": "Shared Memory Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the hardware 
architecture, how do you diagnose the root cause of this latency degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1274", "title": "On-Device PII Guardrail Starvation on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 3B INT8 assistant on Snapdragon 8 Gen 3 starve a lightweight PII guardrail despite 30% NPU utilization?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": {"mobile-chain-auto-secondary-011-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1275", "title": "Diagnosing Exynos NPU Latency Spikes", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an Exynos 2400 safety model occasionally spike from 12ms to over 40ms, and how can determinism be restored?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1276", "title": "NPU Frame Drops During Real-Time Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does NPU frame processing stutter at 30 FPS with only 30% NPU utilization but high memory latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1277", "title": "Diagnosing TPU Throttling During Sustained Gemini Nano Generation", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Gemini Nano on Tensor G3 slow by over 60% after 45s while LPDDR5X memory use stays flat?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1278", "title": "Evaluating Adversarial Defenses on Snapdragon NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which adversarial defense is more efficient on Snapdragon 8 Gen 3: an INT8 NPU ensemble or randomized smoothing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1279", "title": "Evaluating Attention Variants for Exynos 2400 NPU", "topic": 
"attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the memory bandwidth and compute tradeoffs for each variant during the decoding phase and recommend the best architecture for this specific SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1280", "title": "Evaluate Vision Transformer Deployment Cost on Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model do you choose for a target 60 FPS constraint, and how do you justify the compute cost tradeoffs?", "chain_ids": ["mobile-chain-auto-secondary-004-09"], "chain_positions": {"mobile-chain-auto-secondary-004-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1281", "title": "Evaluating Data Pruning for On-Device NPU Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data selection approach is better suited for the heterogeneous Snapdragon 8 Gen 3 environment: CPU-based coreset selection via embeddings or NPU-based gradient loss selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1282", "title": "Calibration Data Selection for INT8 NPU Execution", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which calibration dataset strategy is better for maximizing overall INT8 accuracy on this dual-core NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1283", "title": "On-Device Drift Detection Alternatives", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which local drift detection approach is optimal for the Apple A17 Pro considering power and memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1284", "title": "On-Device Summarization Architecture Selection", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For Exynos 2400 meeting summarization, should a 2B INT8 model be decoder-only or encoder-decoder to minimize memory bandwidth contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1285", "title": "Evaluating DRAM vs Compute 
Energy for A17 Pro", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model do you choose to maximize battery life, and why?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1286", "title": "Thermal Degradation on Shared Memory NPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under a 50% NPU throttle, should tracking load an INT8 fallback model or drop from 60 FPS to 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1287", "title": "Distillation vs Pruning for Snapdragon Hexagon NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use 70% unstructured pruning or a 3x smaller dense student for 10ms video segmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1288", "title": "Evaluating PagedAttention vs Static Allocation for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a Snapdragon 8 Gen 3 7B LLM use a 2 GB contiguous KV cache or PagedAttention within an 8 GB app budget?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], "chain_positions": {"mobile-chain-bucket-kvcachem-05": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1289", "title": "LLM KV Cache Allocation Strategy on Tensor G3", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 1.8B LLM on Tensor G3 use static or PagedAttention-style KV allocation under a 2.5 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1290", "title": "Evaluating Mixed-Precision ViT on Snapdragon 8 Gen 3", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which precision strategy ensures a 1B parameter ViT achieves 30 FPS on a Snapdragon 8 Gen 3 Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1291", "title": "LLM TPU Delegation vs CPU Fallback on Tensor G3", 
"topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which TFLite strategy is faster: 100% TPU delegation with 2.2B params or 30% CPU fallback at 2.0B?", "chain_ids": ["mobile-chain-auto-001-08"], "chain_positions": {"mobile-chain-auto-001-08": 1}, "chain_tiers": {"mobile-chain-auto-001-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1292", "title": "On-Device LLM Evaluation for Tensor G3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Tensor G3 summarization model option should be chosen after considering weights, KV cache, and TPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1293", "title": "Heterogeneous Pipelining on Snapdragon 8 Gen 3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 30 FPS segmentation graph run the FP16 head on CPU sequentially or pipeline it on GPU?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1294", "title": "On-Device LLM Profiling Alternatives for Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is Tensor G3 LLM generation at 150 ms/token limited by 7.5 TOPS TPU compute or 12 GB LPDDR5X bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-009-03"], "chain_positions": {"mobile-chain-auto-secondary-009-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1295", "title": "Evaluating On-Device Toxicity Guardrails for A17 Pro", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which toxicity guardrail design is better for a mobile deployment prioritizing strict PII privacy and low latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1296", "title": "Safety-Critical LLM Fallback Design on Tensor G3", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Tensor G3 hazard-alert fallback design meets a 150ms safety recovery SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1297", "title": "Continuous Vision Thermal Throttling Strategy", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which thermal strategy better sustains continuous 30 FPS segmentation: burst-and-sleep or minimum-clock continuous execution?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1298", "title": "Evaluating Shadow vs. Canary on Snapdragon 8 Gen 3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a Snapdragon 8 Gen 3 video model rollout use shadow deployment or canary, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1299", "title": "A17 Pro Neural Engine vs GPU for Real-Time Video", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a segmentation model with GPU-supported custom ops run on the GPU or the NPU?", "chain_ids": ["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1300", "title": "Mmap Strategies for Shared Memory", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an 8GB FP16 model on a shared-memory architecture use mmap plus mlock or demand-paged mmap for predictable cold starts?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": {"mobile-chain-auto-secondary-014-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1301", "title": "CI/CD for On-Device Models on Apple A17 Pro Neural Engine", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate these proposals and determine which is better for production deployment?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1302", "title": "Hardware-Aware NAS for A17 Pro Neural Engine", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which A17 Pro NAS proposal yields lower real 
latency, FLOP-minimizing MBConv or hardware-profiled dense search within a ~5W power budget?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1303", "title": "A17 Pro Canary Rollout Performance Budget", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum FPS and memory footprint define the A17 Pro canary model theoretical performance budget if the model requires 0.5 TOPS/frame and 15% memory?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1304", "title": "A17 Pro Memory Bounds for Adversarial Defense", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum adversarial-training batch size fits on the target device after OS, model, and per-sample overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1305", "title": "Depthwise Separable Speedup on Exynos NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many MACs do standard and depthwise separable convolutions require, and what is the estimated latency assuming the NPU runs at 10% utilization of its 34.7 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1306", "title": "Tensor G3 Gemini Nano Prefill Latency", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the pure compute latency for Tensor G3 to prefill a 500-token prompt with a 1.5B INT8 Gemini Nano model?", "chain_ids": ["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1307", "title": "On-Device Coreset Sizing for A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum coreset size fits a 2-second fine-tuning budget over 10 epochs on a 35 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1308", "title": "NPU Data Starvation and Pipelined Throughput", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What FPS results from sequential versus double-buffered CPU/NPU processing on a 45 TOPS NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1309", "title": "On-Device Data Quality Gate Compute Budget", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much Exynos 2400 NPU capacity is consumed by four concurrent anomaly-detection quality-gate streams?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1310", "title": "On-Device Active Learning Compute for Curation", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the total daily compute time in seconds required on the 7.5 TOPS TPU to score and curate this dataset.", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 0}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1311", "title": "On-Device KL Divergence Calculation", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the KL divergence KL(P||Q) from memory, and determine if the 34.7 TOPS NPU is bottlenecked by compute or the LPDDR5X memory bandwidth for this operation?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1312", "title": "Encoder vs Decoder Prefill Compute on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do decoder-only and encoder-decoder models compare on prefill compute and theoretical minimum latency on the Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1313", "title": "Energy Cost of Memory vs Compute on Exynos 2400", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much per-inference energy comes from compute versus LPDDR5X memory access for a 50M parameter INT8 model on Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1314", "title": "3-bit AWQ Footprint on Exynos 2400", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the memory footprint of the weights and the compute required (in TOPS) to generate 20 tokens per second, and evaluate if the NPU compute capacity is a limiting factor?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1315", "title": "On-Device Equal Opportunity Evaluation Performance", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the required compute time and energy to run a 14,000-image Equal Opportunity fairness audit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1316", "title": "On-Device LLM Battery Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the proposed degradation ladder keep the system fail-operational under the 15% battery fail-safe mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1317", "title": "Tensor G3 Constant Folding Latency", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many milliseconds of compute latency does a 20% constant folding reduction save?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1318", "title": "A17 Pro NPU Distillation Throughput", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum IPS and energy per inference does the distilled student achieve at 60% NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-1319", "title": "KV-Cache Sizing for A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory does a 4096-token 7B LLM need (32 layers, 32 heads, 128 head dimension), and does it fit in the remaining RAM?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1320", "title": "Memory-Mapped On-Device LLM Initialization", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the required memory bandwidth to achieve 15 tokens per second and determine if the physical RAM is sufficient to hold the model alongside runtime activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1321", "title": "NPU Shared Memory Batching Limit", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum batch size you can safely process in a single pass before triggering OS-level memory eviction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1322", "title": "FP16 Memory and Compute on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Do FP16 weights for a 1.5B A17 Pro model fit in memory, and what is the compute-bound step throughput?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1323", "title": "CI/CD Compute Gating for Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum theoretical FPS this model can achieve if the CI/CD gate policy strictly requires leaving 60% of the NPU compute available for concurrent ISP workloads?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1324", "title": "Hexagon NPU Delegation Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": 
"apply", "status": "published", "phase": "inference", "question": "What is the total inference latency given the CPU fallback and transition overheads?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": {"mobile-chain-auto-001-03": 0}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1325", "title": "INT8 Model Memory Footprint on Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What total memory footprint does a 3B INT8 model need with 25% KV/activation overhead, and does it fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1326", "title": "Parallel Schedule Energy Latency", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What inference latency and energy result from parallelizing 105 GOPs on the A17 Pro Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1327", "title": "Hexagon NPU Profiling Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the Hexagon NPU theoretical latency and utilization for a 225 GOPS model measured at 15ms?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1328", "title": "Structured Pruning Latency on Hexagon NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the exact NPU utilization percentage required for this model after pruning, given the Hexagon's peak INT8 compute capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1329", "title": "On-Device Guardrail Latency and Memory", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the memory footprint and theoretical compute latency for a 1B INT8 guardrail classifier on the Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1330", "title": "Watchdog Timer Sizing for Hexagon NPU", "topic": 
"safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog timeout should be set for a 90 GOPS Hexagon NPU frame with a 3x contention safety margin?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1331", "title": "Calculate sustained FPS under A17 Pro thermal throttling", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What sustained FPS can an A17 Pro object tracker reach under a 1.5W ANE thermal power limit, assuming linear power scaling?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1332", "title": "A17 Pro KV-Cache Memory Budgeting", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What memory is needed for 2B INT8 weights plus a 1000-token FP16 KV cache, and does it fit within a 3GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1333", "title": "LLaMA-3 8B INT8 Quantization Memory Budget on Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the memory footprint for an 8B INT8 model plus a 2048-token KV cache, and does it fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1334", "title": "Exynos 2400 NPU vs CPU Audio Offloading", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the Exynos 2400 NPU handle a 4 TOPS audio workload after sustained bandwidth limits, and what utilization is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1335", "title": "Shadow Deployment Compute Utilization on Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What Hexagon NPU compute utilization results from running Models A and B together at 30 FPS in shadow mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1336", "title": "A17 Pro NPU Latency and Energy 
Calculation", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the expected inference latency in milliseconds and the energy consumed per inference in millijoules to quantify the benefit of NPU offloading?", "chain_ids": ["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1337", "title": "KV Cache Capacity on Exynos 2400", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum sequence length you can support if you implement standard Multi-Head Attention (MHA) versus Multi-Query Attention (MQA) with 1 KV head?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1338", "title": "MobileNetV2 FLOPs vs Apple A17 Pro TOPS", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical maximum FPS can a 1.5 GFLOPs MobileNetV2 model achieve on A17 Pro at 30% utilization?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1339", "title": "Estimating Inference FPS on Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What FPS can the Exynos 2400 process for a 150 GOPS super-resolution model at 35% utilization?", "chain_ids": ["mobile-chain-auto-secondary-004-09"], "chain_positions": {"mobile-chain-auto-secondary-004-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-09": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1340", "title": "Calculate Data Pruning Ratio for On-Device Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What exact data pruning ratio (coreset percentage) is required to complete the fine-tuning job within the 10-minute budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1341", "title": "Tensor G3 TPU Pipeline Throughput Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum sensor data throughput can Tensor 
G3 sustain at 80% TPU utilization and 50,000 ops per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1342", "title": "Compute Budget for On-Device Frame Quality Gate", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What per-frame compute budget remains for a data quality gate after the main 60 FPS model uses 30 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1343", "title": "On-Device Active Learning Throughput", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If your uncertainty-scoring model requires 12.5 GOPS per inference, how many frames per minute can you evaluate to curate your dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1344", "title": "On-Device MMD Drift Window Sizing", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many embeddings and minutes of history fit in a 50MB A17 Pro drift-detection window at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1345", "title": "Compute Latency on Exynos 2400 NPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming a peak compute of 34.7 TOPS, how would you calculate the theoretical compute-bound inference latency for both architectures, and find the absolute difference in milliseconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1346", "title": "Calculate NPU Compute and Memory Energy Per Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much per-inference energy does the A17 Pro model spend on NPU compute and unified-memory weight fetches?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1347", "title": "Calculating 3-bit AWQ Memory Footprint", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total memory footprint of the weights and metadata, and does it fit within the 4.0 GB allocation?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1348", "title": "Quantifying NPU Quantization Bias", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the Predictive Equality (False Positive Rate difference) gap for both models to quantify the fairness degradation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1349", "title": "Calculating Fallback Model Budget on Exynos 2400", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum GOPS per frame a fallback model can require to sustain a strict 30 FPS under 40% throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1350", "title": "Calculate Max FPS After Operator Fusion on ANE", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum FPS can the A17 Pro compiled video model achieve after a 20% operation reduction?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": {"mobile-chain-auto-secondary-003-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1351", "title": "Distillation Target Latency on Hexagon NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum GOPS per frame can a distilled Hexagon NPU speech enhancement student use to meet a 5ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1352", "title": "KV-Cache Memory Sizing for Hexagon NPU", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the exact memory required for the KV-cache of a 3B LLM (4096 tokens, FP16) on an NPU, and does it fit in 2.0 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1353", "title": "Calculate Maximum Context Length for Gemini Nano", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum context window can a 1.8B INT4 Gemini Nano service support under a 2.0 GB memory limit?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1354", "title": "Mixed-Precision Footprint on Hexagon NPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact mixed-precision model weight footprint and the theoretical minimum INT8 compute time per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1355", "title": "CI/CD Latency Gating for A17 Pro Neural Engine", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 450 GOPS candidate model pass a 50ms CI/CD latency gate at 30% hardware utilization on a 35 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1356", "title": "Tensor G3 LLM Memory Budgeting", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum standard quantization precision (in bits) we can use for the model weights to guarantee the model fits entirely within the remaining available RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1357", "title": "Hardware-Aware NAS Latency on Hexagon NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical compute latency for a 90M-MAC INT8 block on a 45 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-010-13"], "chain_positions": {"mobile-chain-auto-secondary-010-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1358", "title": "Operator Fusion Latency on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What execution time do unfused and fully fused NPU schedules take for the three-operator sequence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1359", "title": "Calculate Prefill Latency Bound on Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum compute latency in milliseconds to process a 100-token prompt?", "chain_ids": 
["mobile-chain-auto-secondary-009-03"], "chain_positions": {"mobile-chain-auto-secondary-009-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1360", "title": "Calculate 2:4 Structured Sparsity Speedup on Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do 2:4 structured sparsity changes affect memory footprint and peak TOPS for the LLM deployment?", "chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1361", "title": "On-Device Guardrail Compute and Memory Sizing", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the static memory footprint of the guardrail weights and the theoretical minimum compute latency for one 128-token chunk, assuming 100% Neural Engine utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1362", "title": "Calculate Gemini Nano Inference Bound on Tensor G3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What operational intensity and token generation rate bound apply to batch-1 Gemini Nano inference on Tensor G3?", "chain_ids": ["mobile-chain-bucket-roofline-04"], "chain_positions": {"mobile-chain-bucket-roofline-04": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1363", "title": "Deterministic Execution Timing for On-Device LLM Watchdog", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog timeout should guarantee a 20-token emergency LLM response on Tensor G3 assuming 50% TPU utilization due to 12 GB LPDDR5X bandwidth constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1364", "title": "On-Device Real-Time Sensor Ingestion Throughput for G3 TPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the sensor ingestion buffer and throughput be sized for real-time health monitoring on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1365", "title": "Hexagon NPU Burst Thermal Limit Calculation", 
"topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of tera-operations the NPU can execute in peak burst state before thermal throttling engages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1366", "title": "Calculate KV Cache Memory Footprint on Google Tensor G3", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory is required for a single generation request reaching a sequence length of 2048 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1367", "title": "On-Device Shadow Rollout for A17 Pro Video Segmentation", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an A17 Pro shadow rollout for video segmentation without frame drops or thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1368", "title": "Exynos 2400 NPU Heterogeneous Pipeline Design", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 heterogeneous pipeline map dense, dynamic, and preprocessing work across its processors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1369", "title": "On-Device Defensive Architecture for Biometric Spoofing", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an on-device defense against biometric spoofing for the A17 Pro Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1370", "title": "Designing a 32K Context Window for Google Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a novel attention architecture and memory management strategy that fits a 32K context window within a 2 GB budget while maximizing TPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1371", "title": "Exynos 2400 NPU Video Enhancement CNN Architecture", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would 
you design a custom CNN architecture that maximizes utilization of the NPU's compute without bottlenecking the shared LPDDR5X memory during high-bandwidth concurrent ISP operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1372", "title": "On-Device LLM Sizing for Google Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the model be sized and optimized across prefill, decode, and memory limits to meet the 2-second target?", "chain_ids": ["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1373", "title": "On-Device Data Pruning for Continual Learning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you build an on-device data pruning pipeline for continual learning without exceeding mobile resource budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1374", "title": "A17 Pro Unified Memory Pipeline Optimization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a zero-copy data pipeline that maximizes throughput and minimizes power consumption while ensuring the NPU is never starved for data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1375", "title": "On-Device Data Validation for Continuous Learning", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a multi-stage data validation pipeline for continuous learning that strictly bounds memory bandwidth and NPU utilization?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1376", "title": "On-Device LLM Active Learning Curation on Tensor G3", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should an on-device active learning system curate LLM personalization data on Tensor G3?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 3}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1377", "title": "On-Device DP-SGD on Hexagon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a hybrid training architecture that handles per-example gradient clipping, secure noise generation, and INT8 quantization without compromising privacy guarantees on a Hexagon NPU with 16 GB of shared LPDDR5X memory?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 3}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1378", "title": "On-Device ISP Concept Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you detect ISP concept drift without dropping 4K 60 FPS camera frames?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1379", "title": "On-Device Real-Time Translation Architecture for Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a model and execution strategy to guarantee continuous translation under 100ms per utterance?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1380", "title": "Energy-Aware Wake Vision Architecture on Exynos 2400", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a 50 mW always-on wake vision cascade while minimizing DRAM access?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1381", "title": "Sub-3-bit LLM Deployment on Exynos NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can an 8B assistant be quantized below 4 bits to fit the Exynos 2400 memory budget during heavy camera use?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": 
"secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1382", "title": "On-Device Intersectional Fairness", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the evaluation architecture for the A17 Pro to compute parity metrics across 16 subgroups without causing thermal throttling or battery drain?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1383", "title": "On-Device LLM Federated Personalization on Tensor G3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you personalize an on-device LLM with federated LoRA under a 10 MB daily upload limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1384", "title": "Tensor G3 On-Device LLM Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you build a fail-operational degradation ladder for Tensor G3 translation and summarization under thermal and RAM pressure?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1385", "title": "A17 Pro Asymmetric Distillation for ASR", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you distill an ASR model asymmetrically for efficient execution on the NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1386", "title": "Real-time Multimodal Assistant Latency on Hexagon NPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule a real-time multimodal assistant on Hexagon NPU while avoiding shared memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1387", "title": "LLM Memory Co-Design on Exynos 2400 NPU", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", 
"phase": "inference", "question": "How do you design a memory hierarchy and execution architecture to guarantee 20 tokens/s generation without dropping camera frames?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 3}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1388", "title": "Zero-Copy LLM Architecture on Google Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped inference architecture to achieve zero-copy weight sharing across processes?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1389", "title": "On-Device Fine-Tuning Memory Orchestration", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should memory be orchestrated for on-device fine-tuning with a 4-bit base model and LoRA adapters to maintain the footprint under 2.0 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1390", "title": "On-Device FP8 Inference Design for LLM on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design mixed-precision FP8 activation inference for an on-device LLM on A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1391", "title": "On-Device Multimodal Architecture for Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a quantization and memory allocation strategy to fit a 7B LLM and ViT within the 12-16GB RAM and 64GB/s limits, guaranteeing 15 tok/s and 30 FPS without OS evictions?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 3}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1392", "title": "Edge Telemetry Architecture for On-Device LLMs", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design edge 
telemetry to isolate memory, thermal, or OS scheduler stragglers without hurting latency or uploading raw traces?", "chain_ids": ["mobile-chain-auto-secondary-006-22"], "chain_positions": {"mobile-chain-auto-secondary-006-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1393", "title": "Hardware-Aware NAS for Video Segmentation on Exynos 2400", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS strategy that optimizes for both latency and shared memory access?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1394", "title": "Heterogeneous Scheduling for Multimodal Pipeline", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a heterogeneous multimodal pipeline be scheduled to reduce memory traffic and NPU stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1395", "title": "Always-On Video NPU Power Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you optimize power for always-on video analytics by pacing NPU work instead of racing to idle?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1396", "title": "Heterogeneous Pipeline Profiling on Snapdragon 8 Gen 3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you profile this heterogeneous pipeline to find NPU stalls and memory bottlenecks, and what architectural changes eliminate the 15ms overhead?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1397", "title": "On-Device Guardrail Architecture for Exynos", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a concurrent architecture that ensures strict policy enforcement without degrading the user experience or starving the ISP during multimodal tasks?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1398", "title": "Safety-Critical ASIL-D Pedestrian Detection on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect an ASIL-D pedestrian detection pipeline on Hexagon NPU with deterministic execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1399", "title": "Continuous On-Device Sensor Fusion and Inference Pipeline", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you build a continuous sensor fusion pipeline using low-power hubs, zero-copy buffers, and NPU bursts on a 45 TOPS INT8 NPU with 12-16 GB LPDDR5X memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1400", "title": "Continuous Real-Time Vision Under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a thermal-aware execution architecture that processes 30 FPS continuously without ever triggering thermal throttling or dropping frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1401", "title": "On-Device LLM Budgeting for Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a 7B on-device LLM be budgeted under a 6 GB memory cap while supporting a 4096-token context window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1402", "title": "A/B Rollout Memory Bottleneck on Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the A/B rollout memory bottleneck on Exynos 2400 and how should it be fixed?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1403", "title": "Optimizing Adversarial Defense on Exynos NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the 5-pass Monte Carlo bottleneck on a shared-memory NPU, and how can it be optimized below 30ms?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1404", "title": "Optimizing Inverted Residuals on Google Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this block design inefficient on the Tensor G3, and what structural change improves utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1405", "title": "Diagnosing NPU Compute Bottlenecks on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the compute precision and shared memory architecture to diagnose the bottleneck and quantify the fix?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1406", "title": "On-Device Coreset Selection for Nightly Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should nightly fine-tuning data be scored and selected on-device using the Exynos 2400 dual-core NPU within a 100 TOPs budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1407", "title": "Optimizing On-Device Data Quality Gates for LLMs", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the bottleneck in this data validation pipeline and quantify a solution to optimize it while keeping the 7.5 TOPS TPU available?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1408", "title": "Optimizing On-Device Calibration Data Curation for NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate calibration data for NPU quantization without loading the full image corpus into memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1409", "title": "Optimizing On-Device Drift Detection for Google Tensor G3", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this computational bottleneck and quantify an optimized on-device drift detection strategy?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1410", "title": "Optimizing Encoder-Decoder on Snapdragon NPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an encoder-decoder model be optimized on a mobile NPU for memory-bound autoregressive decoding?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1411", "title": "Optimizing Sub-4-bit LLM Deployment on Google Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize a sub-4-bit LLM deployment on Tensor G3 for memory-bandwidth-bound decoding?", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1412", "title": "NPU Preemption and CPU Fallback Optimization", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design NPU preemption handling and CPU fallback without crashing or stalling the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1413", "title": "Distillation Projection Layer Bottlenecks", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are distillation projection layers a bottleneck on the NPU, and what student design avoids them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1414", "title": "Optimizing KV-Cache Eviction for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cap and evict KV-cache entries for an on-device LLM while preserving useful context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1415", "title": "Zero-Copy Memory Mapping for Hexagon NPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory bottleneck and propose an optimization to eliminate the initialization spike?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": 
{"mobile-chain-auto-secondary-014-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1416", "title": "Mitigating Jetsam Evictions on A17 Pro NE", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory bottleneck and quantify an optimization strategy to prevent OOM evictions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1417", "title": "CI/CD Hardware Fallback on Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should CI/CD catch Tensor G3 TPU fallbacks before release, and what is the latency cost of this failure?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1418", "title": "CoreML Fallback Memory Transfer Bottleneck", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you eliminate CoreML CPU fallback for a custom attention operator on the A17 Pro Neural Engine?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": {"mobile-chain-auto-001-02": 1}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1419", "title": "A17 Pro Unified Memory LLM Bottleneck", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the primary memory bottleneck for inference, why does an FP16 deployment fail, and what optimization is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1420", "title": "Hexagon NPU Telemetry Bottleneck", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign Hexagon NPU telemetry to avoid per-frame memory and CPU overhead on a Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1421", "title": "Dual-Core NPU Operator Scheduling for Memory Contention Mitigation", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should dual-core Exynos NPU operators be 
scheduled to reduce shared memory contention, and which operators should be parallelized?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1422", "title": "A17 Pro ANE Memory Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the A17 Pro Neural Engine reach only 5 TOPS of 35 TOPS, and what latency gain should layout fusion deliver?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1423", "title": "Structured Pruning for Apple A17 Pro Neural Engine", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured pruning fail to speed up inference, and what structured approach should replace it?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1424", "title": "Optimizing On-Device Toxicity Guardrails for Gemini Nano", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the guardrail bottleneck and quantify an optimized execution strategy on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1425", "title": "Deterministic Execution on A17 Pro", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you make inference deterministic despite unified-memory contention from other subsystems?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1426", "title": "Optimizing High-Frequency Sensor Ingestion on A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the high-frequency IMU ingestion be optimized to unblock video processing and stay within a 5W envelope?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1427", "title": "Progressive Rollout of Gemini Nano A/B Experiment", 
"topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs between a strict 1% shadow deployment versus a phased geographic rollout, and quantify the resource thresholds that would trigger an automatic rollback?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1428", "title": "Hexagon NPU Sizing for On-Device LLM", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile NPU be evaluated for batch-1 on-device LLM inference to meet the 40 tokens/sec SLA?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1429", "title": "On-device LLM Guardrail Sizing for Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the memory and latency tradeoffs of deploying an auxiliary classifier specifically optimized for the G3's on-device TPU and 12 GB LPDDR5X memory footprint?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1430", "title": "A17 Pro GQA Sizing for On-Device LLMs", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Grouped-Query Attention (GQA) be sized to reduce the KV-cache footprint compared to Multi-Head Attention (MHA)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1431", "title": "Sizing Inverted Residuals for Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should inverted residual expansion ratios be chosen to hit a strict 15ms latency budget?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1432", "title": "Estimating Real-Time Video Segmentation Compute on A17 Pro", "topic": 
"compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can real-time video segmentation fit within the Apple A17 Pro compute and memory bandwidth budget without thermal throttling?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1433", "title": "On-Device LLM Coreset Sizing for Tensor G3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you size this minimal dataset and select coreset strategies to ensure the fine-tuning process completes within a 1-hour overnight charging window while maximizing the Information-Compute Ratio?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1434", "title": "On-Device Data Validation Pipeline on Snapdragon 8 Gen 3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you split on-device data validation between CPU and Snapdragon 8 Gen 3 NPU considering memory and compute constraints?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1435", "title": "On-Device Active Learning Data Selection", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a mobile device score a 30 FPS 1080p stream for active learning without disrupting foreground apps and shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1436", "title": "DP-SGD Memory Constraints on Exynos NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should DP-SGD micro-batches or ghost clipping be sized for NPU memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1437", "title": "On-Device Streaming Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How 
would you perform streaming drift detection on-device using NPU embeddings and CPU histograms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1438", "title": "Sizing On-Device Translation Architectures for A17 Pro", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device translation architecture be sized for NPU memory-bandwidth limits?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1439", "title": "Energy Profiling for NPU Operator Selection", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you choose between lookup tables and polynomial approximations using NPU energy consumption principles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1440", "title": "Sizing W4A8 LLM Deployment on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you deploy a 7B LLM with W4A8 quantization under a 4 GB RAM limit, and what is its impact?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1441", "title": "On-Device LLM Intersectional Fairness Sizing", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size an on-device LLM fairness evaluation so subgroup analysis does not exceed memory or latency budgets?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1442", "title": "On-Device LoRA with Apple A17 Pro", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you run federated LoRA updates for a 1.5B model on a mobile device within memory limits without degrading the battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1443", "title": "Thermal Degradation for Video Segmentation on A17 Pro", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", 
"phase": "inference", "question": "How would you design a degradation ladder that transitions to a fallback state without dropping frames or exhausting unified memory?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1444", "title": "Exynos 2400 NPU Operator Lowering Tradeoffs", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should unsupported attention operators be lowered or refactored for Exynos 2400 NPU delegation to prevent severe memory bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1445", "title": "Distilling LLMs for Tensor G3 TPU Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill an LLM for Tensor G3 TPU deployment using hardware-friendly student architecture choices?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1446", "title": "KV-Cache Sizing for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design a KV-cache strategy and calculate the maximum context length for a 1.8B LLM within a 1.5 GB background memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1447", "title": "mmap-Driven LLM Streaming on A17 Pro Neural Engine", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate using memory-mapped files (mmap) to execute this model without being terminated by the OS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1448", "title": "Video Segmentation Memory Sizing on Exynos 2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the memory management strategy, address fragmentation, and size the buffers to ensure stable execution within a 500 MB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1449", "title": "On-Device LLM Precision Strategy for Tensor G3", 
"topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate precision formats and decide on a deployment recipe that fits the 3.5 GB limit while maximizing throughput?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1450", "title": "CI/CD Release Gates for Hexagon NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What CI/CD release gates should verify Hexagon NPU memory bandwidth, SRAM use, and CPU fallback risk to guarantee a 5ms SLA?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": {"mobile-chain-auto-secondary-006-21": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1451", "title": "Exynos 2400 NPU Delegation and Operator Fallback", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you rewrite an Exynos 2400 model graph to avoid NPU delegation fallback?", "chain_ids": ["mobile-chain-auto-001-04"], "chain_positions": {"mobile-chain-auto-001-04": 1}, "chain_tiers": {"mobile-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1452", "title": "Sizing an LLM for Exynos 2400 NPU Deployment", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 7B LLM be sized for Exynos 2400 using quantization and KV-cache budgeting?", "chain_ids": ["mobile-chain-auto-027-11"], "chain_positions": {"mobile-chain-auto-027-11": 1}, "chain_tiers": {"mobile-chain-auto-027-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1453", "title": "On-Device Telemetry Budgeting for A17 Pro Neural Engine", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you budget on-device telemetry for A17 Pro without sending raw logs or exceeding battery limits?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1454", "title": "Hardware-Aware NAS for Hexagon NPU Realization", "topic": 
"neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS search space to ensure the resulting architecture stays compute-bound without hitting NPU memory bandwidth limits?", "chain_ids": ["mobile-chain-auto-secondary-010-13"], "chain_positions": {"mobile-chain-auto-secondary-010-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1455", "title": "Evaluate A17 Pro Power Budgeting for Real-Time Video AI", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should real-time video AI be power-budgeted on A17 Pro to avoid sustained thermal throttling?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1456", "title": "NPU Memory Contention in Shared LPDDR5X Systems", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you evaluate the memory trace to identify the bottleneck, and what architectural adjustments do you make to hit the latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1457", "title": "NPU Sparsity Optimization for 4K Video", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the tradeoffs between applying 50% unstructured pruning versus 2:4 structured sparsity to achieve the 30 FPS target on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1458", "title": "Quantization Sizing for Exynos 2400 Shared Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What quantization plan fits a generative model into the Exynos 2400 shared memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1459", "title": "Sizing On-Device Guardrails on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size on-device guardrail inference on Hexagon NPU for low-latency generated text checks?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": 
{"mobile-chain-auto-secondary-011-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1460", "title": "On-Device LLM Roofline Analysis on Tensor G3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using the roofline model, evaluate whether autoregressive decoding (batch size 1) is compute-bound or memory-bound, and predict the impact of INT4 weight-only quantization.", "chain_ids": ["mobile-chain-bucket-roofline-04"], "chain_positions": {"mobile-chain-bucket-roofline-04": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1461", "title": "Deterministic Driver Monitoring on Exynos 2400", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you make the driver monitoring deterministic with watchdogs and pinned NPU resources so it never misses a deadline?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1462", "title": "Real-Time Sensor Fusion Ingestion Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size real-time sensor fusion ingestion using shared memory ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1463", "title": "Sustained Thermal Budgeting for Continuous LLMs", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you sustain continuous LLM audio processing within Tensor G3 thermal limits?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1464", "title": "Sizing KV-Cache for On-Device LLMs on Tensor G3", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should KV-cache and model weights be sized for a Tensor G3 on-device LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1465", "title": "Exynos 2400 Shared Memory Budgeting", "topic": 
"vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 shared memory be allocated for a 3B LLM and its KV cache to support a 2048-token context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1467", "title": "Hexagon NPU INT8 Peak Performance Recall", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the advertised peak theoretical INT8 compute capacity of the Hexagon NPU for roofline modeling?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1468", "title": "Hardware-Based Model Extraction via TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the specific term for this class of hardware-based security attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1469", "title": "MobileNet Depthwise Separable Convolution Recall", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific type of convolution operation, first popularized by the MobileNetV1 architecture, factorizes a standard convolution into two separate layers to drastically reduce computational cost?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1470", "title": "A17 Pro Neural Engine Specification Recall", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the key Apple A17 Pro Neural Engine compute and memory specifications for mobile ML sizing?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1471", "title": "Define Coresets for On-Device Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What ML term describes a small representative subset for Tensor G3 on-device personalization under tight memory 
limits?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1472", "title": "Hexagon NPU Peak Throughput Data Type", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What specific numerical data type must the input schema validate against to natively utilize the peak 45 TOPS capability of this NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1473", "title": "On-Device Curation Memory Limit", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the total unified memory capacity shared between the CPU, GPU, and Neural Engine on the Apple A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1474", "title": "Hexagon NPU Capacity for Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What Hexagon NPU compute and shared-memory capacities are critical constraints for background drift detection?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1475", "title": "Encoder vs Decoder Bottlenecks on A17 Pro", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "By definition, which phase (encoder prefill or decoder generation) is typically compute-bound, and which is memory-bandwidth-bound?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Encoder prefill is compute-bound; decoder generation is memory-bandwidth-bound.", "Encoder prefill is memory-bandwidth-bound; decoder generation is compute-bound.", "Both phases are heavily compute-bound due to the 35 TOPS Neural Engine.", "Both phases are heavily memory-bandwidth-bound due to the unified memory pool."], "correct_index": 0}}, {"id": "mobile-1476", "title": "Definition of Equalized Odds for On-Device Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": 
"mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the exact definition of equalized odds in the context of fairness evaluation?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1477", "title": "A17 Pro Unified Memory Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total capacity of the shared unified memory available across the CPU, GPU, and 16-core Neural Engine on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1478", "title": "Recall Knowledge Distillation Basics for Tensor G3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is knowledge distillation and how does it train a smaller Tensor G3 student model?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1479", "title": "Tensor G3 KV Cache Paged Allocation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory management technique stores KV cache in non-contiguous physical blocks to eliminate external fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1480", "title": "Apple A17 Pro Unified Memory Recall", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory architecture does the A17 Pro use to share its 8 GB pool between the CPU, GPU, and Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1481", "title": "Exynos 2400 Unified Memory Capacity Recall", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total shared LPDDR5X memory capacity on the Exynos 2400 that the NPU shares with the CPU and GPU?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 0}, 
"chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1482", "title": "Google Tensor G3 TPU 16-bit Format Recall", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the name of this format, and how many bits does it allocate to the exponent versus the fraction?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1483", "title": "Exynos 2400 Shared Memory Architecture Recall", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the specific memory architecture of the Samsung Exynos 2400, and how does its capacity affect the maximum feasible model size?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1484", "title": "Tensor G3 Memory Constraints for Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total available system RAM on the Google Tensor G3, and what specific on-device LLM is this platform optimized to run concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1485", "title": "Exynos 2400 NPU Shared Memory Architecture", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which shared-memory metrics should be monitored to profile Exynos 2400 NPU bottlenecks?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1486", "title": "Unstructured vs Structured Pruning on Exynos 2400 NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Why does unstructured pruning usually fail to speed up inference on mobile NPUs, and when does structured pruning help?", "chain_ids": ["mobile-chain-auto-secondary-006-34"], "chain_positions": {"mobile-chain-auto-secondary-006-34": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1487", "title": "Edge Guardrail NPU Performance Recall", "topic": "adversarial-robustness", "competency_area": "reliability", 
"track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To properly document the hardware assumptions and latency constraints for the edge deployment, what is the advertised peak INT8 performance of this specific NPU class?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1488", "title": "Hardware Safety Mechanism Recall", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the standard industry term for this hardware safety mechanism?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1489", "title": "Exynos 2400 Shared Memory Architecture", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware specifications of modern mobile SoCs, what type of memory architecture does the NPU use to access streaming data alongside the CPU and GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1490", "title": "Google Tensor G3 TPU Peak Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For burst compute tasks before thermal throttling engages, what is the specified peak performance in TOPS for the Google Tensor G3's on-device TPU?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1491", "title": "Shadow Deployment Design for Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify an Exynos 2400 shadow deployment that protects latency and memory headroom without causing memory starvation or thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1492", "title": "Designing On-Device Defenses for Exynos 2400 NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design on-device adversarial defenses for the NPU without blowing the latency or memory bandwidth budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1493", 
"title": "Designing Mobile Attention for Hexagon NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an attention architecture specification that fits within the NPU's capabilities and memory limits while maintaining acceptable generation quality?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1494", "title": "Real-Time Object Detection on Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a real-time object detector for Tensor G3 that balances compute and memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1495", "title": "On-Device Inference Cost on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate on-device inference cost on Snapdragon 8 Gen 3 from compute and memory limits to meet a 20 tokens/sec requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1496", "title": "On-Device Coreset Selection for Image Personalization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Design a data pruning and selection pipeline that maximizes the Information-Compute Ratio (ICR) without causing memory exhaustion.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1497", "title": "On-Device Real-Time Video Pipeline Design for A17 Pro", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time A17 Pro video pipeline using zero-copy unified memory to meet these requirements without excessively draining the battery?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 2}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1498", "title": "On-Device Data Quality Gates for LLMs", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design quality gates to filter out random pocket-dials, PII, and gibberish without degrading device 
performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1499", "title": "Data Curation for INT8 Hexagon NPU Calibration", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate an INT8 calibration dataset that captures long-tail edge cases considering the shared 12-16 GB LPDDR5X memory?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 1}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1500", "title": "On-Device DP-SGD Specification for A17 Pro Text Prediction", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify on-device DP-SGD text prediction on A17 Pro with private clipping and noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1501", "title": "On-Device Drift Detection for Translation Models", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a privacy-preserving drift detection system for Tensor G3 translation models using local embeddings and federated analytics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1502", "title": "On-Device Summarization Architecture Selection for Snapdragon 8 Gen 3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which architecture should Snapdragon 8 Gen 3 use for long-input, short-output summarization and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1503", "title": "Energy-Aware Memory Access Design on Tensor G3", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an inference specification that minimizes energy per operation, specifically focusing on memory access versus compute?", "chain_ids": ["mobile-chain-auto-secondary-013-13"], "chain_positions": {"mobile-chain-auto-secondary-013-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1504", "title": "Federated Learning Specification for Exynos 2400", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How would you design a federated learning client specification that respects shared memory and thermal constraints on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1505", "title": "Real-Time Translation Degradation on Snapdragon 8 Gen 3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a real-time translation degradation ladder on Snapdragon 8 Gen 3 to guarantee continuous availability under severe thermal or memory constraints?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1506", "title": "Exynos 2400 NPU Distillation Specification", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you specify distillation for an Exynos 2400 NPU-friendly student model?", "chain_ids": ["mobile-chain-auto-secondary-014-21"], "chain_positions": {"mobile-chain-auto-secondary-014-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1507", "title": "Memory-Mapped NPU Inference Design on Snapdragon 8 Gen 3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped inference system to achieve zero-copy weight loading and avoid redundant memory allocations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1508", "title": "A17 Pro Memory Specification for On-Device LLM", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an A17 Pro on-device LLM memory plan handle weights, KV cache, and activations given the OS and background apps consume a baseline of 3.5 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1509", "title": "Mixed-Precision Video Super-Resolution on Exynos 2400", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you specify mixed-precision (INT8/FP16) video super-resolution on Exynos 2400 to maintain fidelity without exceeding the 34.7 TOPS budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1510", "title": "CI/CD Pipeline Design for On-Device LLM on Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": 
"specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What CI/CD checks catch LLM latency, memory, and consistency regressions before OTA release?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1511", "title": "CoreML Conversion and ANE Delegation Strategy", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert a model to CoreML while avoiding NPU delegation fallbacks, keeping execution within the thermal budget?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 3}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1512", "title": "On-Device LLM Sizing for Apple A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a 7B LLM be sized and partitioned to fit within the 8 GB unified memory while efficiently leveraging the Neural Engine?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 1}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1513", "title": "Dual-Core NPU Operator Scheduling for Memory Contention", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should dual-core Exynos NPU scheduling interleave compute-bound and memory-bound operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1514", "title": "Profiling ANE Bottlenecks for Real-Time Video Segmentation", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile A17 Pro Neural Engine bottlenecks for real-time video segmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1515", "title": "Designing a Sparsity Strategy for A17 Pro ANE", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What pruning structure gives real A17 Pro ANE speedups for a 1.2B vision-language model at 30 FPS and 2W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1516", "title": "Architecting for the Exynos 2400 NPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": 
"specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect a 60 FPS real-time segmentation model for an NPU to remain compute-bound under shared-memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1517", "title": "Designing a Fail-Safe Pedestrian Detection System", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a fail-safe pedestrian detection system with watchdogs and fallback inference if the Neural Engine hangs or encounters a fatal memory error within the shared 8 GB unified memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1518", "title": "Real-Time Sensor Ingestion for AR Tracking", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you ingest AR camera and IMU streams in real time using unified-memory ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1519", "title": "Continuous Video Analytics Thermal Throttling Design", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design continuous video analytics to maintain sustained performance under thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1520", "title": "LLM Memory Budget Specification for A17 Pro", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the exact memory budget breakdown required to support a 2048-token context window, and how should the remaining 5 GB be allocated for weights, KV-cache, and activations?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1521", "title": "Roofline Analysis for Mobile AI: Optimizing Real-time Object Detection on Apple A17 Pro", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use roofline analysis to diagnose this performance bottleneck and optimize real-time object detection?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1522", "title": "Roofline Analysis for Mobile AI Accelerator: Optimizing a Vision Model on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Roofline Analysis to diagnose whether the model is compute- or memory-bound and propose optimizations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1523", "title": "Exynos 2400 NPU Roofline Analysis for Mobile Workload Optimization", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you perform a roofline analysis for a mobile generative workload on Exynos 2400 NPU to determine if it is compute or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1524", "title": "Roofline Analysis on Apple A17 Pro: Identifying Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using the Roofline Model, how would you diagnose whether this poor utilization is due to compute-bound or memory-bound limitations?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1525", "title": "Roofline Analysis for MobileNetV3 on Google Tensor G3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using roofline math, is a 100 GOPS model moving 1GB per inference compute-bound or memory-bound, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1526", "title": "Optimizing Mobile LLMs: Snapdragon 8 Gen 3 NPU Capabilities", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the architectural advantages and disadvantages of using the Hexagon NPU (45 TOPS) for a 7B INT4/INT8 LLM compared to the device's GPU or CPU?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1527", "title": "Mobile AI Inference Sizing on Snapdragon NPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should 
mobile AI inference on an NPU be sized using empirical profiling beyond peak TOPS?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1528", "title": "A17 Pro Neural Engine Inference Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical maximum inference throughput of the A17 Pro Neural Engine for the model?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1529", "title": "Tensor G3 On-Device Inference Costing", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cost on-device inference across profiling, quantization, and memory limits for a 500 GFLOP LLM?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1530", "title": "Mobile AI Inference Cost on Exynos 2400", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you calculate Exynos 2400 NPU utilization and memory bandwidth for a mobile AI workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1531", "title": "Snapdragon 8 Gen 3 NPU Inference Cost Optimization", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate FLOPs, NPU-hours, and subsequent energy costs for an INT8 LLM on the Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1532", "title": "Evaluating Model Deployment on Samsung Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Evaluate which architecture is more suitable and quantify the maximum sustainable FPS for each model on the given hardware.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1533", "title": "Google Tensor G3 VRAM Budget Components", "topic": "vram-budgeting", "competency_area": "memory", "track": 
"mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do model weights, optimizer states, intermediate activations, and the KV-cache each consume and impact the overall 12 GB LPDDR5X memory budget?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1534", "title": "Optimizing LLM Memory Footprint on Apple A17 Pro for On-Device Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a strategy to fit this model within the memory constraints while maximizing inference speed and minimizing power consumption?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1535", "title": "Diagnosing OOM on Tensor G3: VRAM Budgeting for Large Models", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the most likely culprits for the OOM error, considering all major memory components?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1536", "title": "VRAM Budgeting for a Mobile ML Model on Samsung Exynos 2400 NPU", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget mobile model memory across weights, activations, and training state to fit within 12 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1537", "title": "VRAM Budgeting for 7B LLM Inference on Apple A17 Pro", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget a 7B LLM for Apple A17 Pro unified memory and explicitly account for KV-cache growth?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1538", "title": "VRAM Budgeting for On-Device LLM Inference on Google Tensor G3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": 
"realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your approach to estimate and manage the VRAM budget effectively?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1539", "title": "On-Device LLM Deployment: Apple A17 Pro VRAM Budgeting for Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget for model weights, activations, and the KV-cache for a maximum sequence length of 2048 tokens to balance memory efficiency versus performance?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1540", "title": "Apple A17 Pro KV-Cache Sizing and Memory Pressure for LLM Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does KV-cache sizing impact overall system memory availability for other applications and the operating system?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1541", "title": "KV-Cache Memory Optimization on Snapdragon 8 Gen 3 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory pressure causes and propose strategies to optimize KV-cache management for the Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1542", "title": "On-Device LLM KV-Cache Optimization for Google Tensor G3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a KV-cache management strategy that optimizes inference performance and mitigates memory pressure on the 12GB constraint for long contexts?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1543", "title": "Diagnosing KV-Cache Memory Pressure on Apple A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose KV-cache memory pressure on Apple A17 Pro during long-context LLM use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1544", "title": "Optimizing KV-Cache on Snapdragon 8 Gen 3 for LLMs with Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive KV-cache management strategy for a 7B LLM on a mobile NPU to efficiently support 4096-token contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1545", "title": "KV-Cache Pressure on Google Tensor G3 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a mobile LLM manage KV cache for 2048-token context to avoid latency spikes and OOMs?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1546", "title": "KV-Cache Optimization for Large Language Models on Edge NPUs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an edge NPU KV-cache strategy balancing paged allocation and pre-allocation under strict memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1547", "title": "Optimizing Large Language Model Deployment on Apple A17 Pro for Cold Start and Shared Memory", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a strategy to leverage memory-mapped files for efficient weight loading, shared memory across processes, and techniques to mitigate cold start issues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1548", "title": "Cold Start Optimization for Large Generative Models on Apple A17 Pro", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a memory-mapping strategy to minimize cold start latency for a large generative model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1549", "title": "Optimizing LLM Inference with Memory-Mapped Weights on Samsung Exynos 2400 NPU", "topic": "memory-mapped-inference", "competency_area": 
"memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize memory-mapped LLM weights on Exynos 2400 to reduce cold-start latency and efficiently share memory across processes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1550", "title": "Designing for Memory Pressure on Google Tensor G3", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the LLM for real-time inference while avoiding OOM errors and minimizing fragmentation?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-24": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1551", "title": "Memory Optimization for 7B LLM Inference on Samsung Exynos 2400 NPU", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What minimum integer quantization bit-width is required to fit the weights, and what is the KV cache size for 128 tokens?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1552", "title": "Optimizing Large Generative AI Model Deployment on Samsung Exynos 2400 NPU under Memory Constraints", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate host offloading versus activation recomputation to fit a 15GB peak memory LLM into the Exynos 2400's 12GB RAM?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1553", "title": "Optimizing Large Language Models on Google Tensor G3 for Mobile", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize large language model deployment on Tensor G3 for mobile memory and latency?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1554", "title": "Analyzing On-Device LLM Latency on Google Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": 
"L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose on-device LLM latency on Tensor G3 into TTFT, TPOT, and processing overheads?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 3}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1555", "title": "Apple A17 Pro On-Device ML Latency Decomposition for Real-Time AR", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose Apple A17 Pro ML latency for a real-time AR pipeline to find bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1556", "title": "Latency Decomposition for Mobile ML on Snapdragon NPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile and decompose latency components to determine which architecture offers better real-time performance?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 2}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1557", "title": "Optimizing On-Device ML Inference Latency on Samsung Exynos 2400 NPU for Mobile Applications", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose and optimize end-to-end Exynos 2400 NPU inference latency for a mobile app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1558", "title": "Optimizing Real-Time Semantic Segmentation Latency on Samsung Exynos 2400 NPU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline specific implementation strategies focusing on both compute and memory to achieve consistent sub-33ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1559", "title": "Real-Time ML Inference on Apple A17 Pro: Frame Budgeting for Jank Prevention", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the ML inference pipeline to strictly adhere to the 16.67ms frame budget on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1560", "title": "Real-Time ML Inference and WCET Analysis on Snapdragon NPU for AR/VR", "topic": 
"real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you perform WCET (Worst-Case Execution Time) analysis for real-time ML inference on the NPU to prevent AR jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1561", "title": "Diagnosing ML Model Latency on Google Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose whether the primary bottleneck is compute, memory, or I/O bound on this specific hardware platform?", "chain_ids": ["mobile-chain-auto-secondary-009-03"], "chain_positions": {"mobile-chain-auto-secondary-009-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1562", "title": "Profiling Latency in a Mobile NPU Application", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific metrics would you monitor and how would you interpret the data to distinguish between compute, memory, and I/O bound issues on an NPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1563", "title": "Optimizing Real-Time ML Inference on Apple A17 Pro for Low Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile this application on an Apple Neural Engine to identify bottlenecks and achieve the 30ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1564", "title": "Optimizing a Transformer Model on Snapdragon Hexagon NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize a transformer model bottleneck on a mobile NPU to reach the 100ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1565", "title": "Real-time Semantic Segmentation on Exynos 2400: Latency Bottleneck Design", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile and identify latency bottlenecks to preemptively meet the 30ms constraint on the Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1566", "title": "Optimizing Object Detection Latency on Apple A17 Pro", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use mobile profiling tools to find why an object detector takes 60ms instead of the 33ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1567", "title": "Optimizing Real-time ML Inference Latency on Snapdragon NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific profiling tools and techniques would you employ to pinpoint the exact causes of latency spikes and evaluate the two architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1568", "title": "Optimizing On-Device Object Detection Latency on Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize on-device object detection latency using profiling and model changes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1569", "title": "Optimizing a Vision Model on Samsung Exynos 2400 NPU for Low Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically profile to determine if the bottleneck is compute-bound, memory-bound, or I/O-bound, and identify the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1570", "title": "Optimizing Real-time Object Detection Latency on Apple A17 Pro", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which system metrics and profiling tools matter most when 80ms average inference latency peaks catastrophically to 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1571", "title": "Designing an INT8 Quantization Strategy for On-Device Object Detection on Google Tensor G3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an INT8 quantization strategy for Tensor G3 on-device object detection to meet <50ms latency and <200MB footprint constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1572", "title": "Quantizing a Large Language Model for Samsung Exynos 2400 NPU Deployment", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive quantization strategy to meet these quantitative memory and throughput constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1573", "title": "Optimizing LLM Deployment on Apple A17 Pro: Quantization Strategy Deep Dive", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Describe your strategy for quantizing this LLM to achieve real-time inference while minimizing memory footprint and energy consumption.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1574", "title": "Optimizing LLM Inference on Snapdragon 8 Gen 3 with INT4 Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Snapdragon 8 Gen 3 LLM inference with INT4 quantization to achieve sub-200ms TPOT and reduce memory footprint?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1575", "title": "Google Tensor G3: Mixed-Precision Strategy for On-Device LLM Inference", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the primary considerations and trade-offs for choosing between FP16, BF16, and FP8 mixed-precision formats for LLM inference on this hardware?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1576", "title": "Optimizing Large Language Model Inference with Mixed-Precision on Apple A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which mixed-precision plan gets an FP32 LLM under 100 ms/token on an NPU within 8 GB and ~5W?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1577", "title": "Snapdragon Hexagon NPU Mixed-Precision LLM Deployment", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", 
"question": "What is the percentage reduction in memory footprint, how many maximum parameters can fit in 12 GB using FP8, and what are the speed/accuracy trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1578", "title": "Mixed-Precision LLM Deployment on Samsung Exynos 2400 NPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a mixed-precision inference and fine-tuning strategy for a 7B LLM on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1579", "title": "Optimizing LLM Inference on Apple A17 Pro with Mixed Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you optimize Apple A17 Pro LLM inference with mixed precision to meet latency and energy budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1580", "title": "Optimizing LLM Deployment on Snapdragon 8 Gen 3 Hexagon NPU with Mixed Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which quantization approach is mandatory for this mobile deployment scenario, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1581", "title": "Optimizing Mixed-Precision Inference on Exynos 2400 NPU for Mobile CV", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks and quantify mixed-precision optimization gains for a CV model on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1582", "title": "Sub-4-bit LLM Deployment on Google Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 14 GB FP16 7B LLM be compressed below 4 bits for Tensor G3 without losing critical task accuracy?", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1583", "title": "Extreme Sub-4-bit Quantization on Samsung Exynos 2400 NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"How would you evaluate extreme sub-4-bit quantization for an LLM on Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1584", "title": "Extreme Quantization for On-Device LLM: A17 Pro Evaluation", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate extreme quantization options for an on-device LLM on Apple A17 Pro considering power, performance, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1585", "title": "Deploying Sub-4-bit LLMs on Exynos 2400 NPU: Precision, Performance, and Trade-offs", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the key trade-offs you'd consider, and how would you evaluate the effectiveness of different extreme quantization techniques specific to this hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1586", "title": "Apple A17 Pro ML Inference Power Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you determine the optimal operating frequency and voltage for the Neural Engine and unified memory to maximize inferences per joule while staying within the 2.5W budget?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1587", "title": "Optimizing Quantized LLM Inference on Snapdragon 8 Gen 3 Hexagon NPU for Power Efficiency", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize quantized LLM inference on a mobile NPU for power efficiency to meet a 20 ms latency and 3W power cap?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1588", "title": "Optimizing ML Inference Power on Google Tensor G3", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Tensor G3 ML inference power without sacrificing latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1589", "title": "Thermal Management and Sustained Performance of Samsung 
Exynos 2400 NPU", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do mobile thermal management strategies impact the sustained versus burst performance profile of an NPU during continuous inference?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1590", "title": "Optimizing LLM Inference on Snapdragon 8 Gen 3 NPU under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Snapdragon 8 Gen 3 schedule burst and sustained LLM inference when thermal throttling lowers NPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1591", "title": "Diagnosing Sustained Performance Degradation on Exynos 2400 NPU", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this sustained performance degradation and propose solutions?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1592", "title": "Samsung Exynos 2400 NPU Thermal Constraints for Sustained ML Inference", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle NPU thermal constraints for sustained ML inference?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 4}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1593", "title": "Energy Optimization on Samsung Exynos 2400 NPU: Memory vs. 
Compute Costs", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize NPU energy by balancing memory access versus compute cost using Horowitz energy principles?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1594", "title": "Optimizing Neural Network Inference on Apple A17 Pro for Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the system to minimize energy per operation, considering the Horowitz energy table, the energy cost of memory access versus compute, and energy-aware operator selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1595", "title": "Energy Modeling for On-Device INT8 Convolution on Snapdragon Hexagon NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you model INT8 convolution energy on Snapdragon Hexagon NPU across compute and memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1596", "title": "Energy-Aware LLM Deployment on Samsung Exynos 2400 NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the model's energy profile considering the Horowitz energy table principles and the disparity between NPU compute and LPDDR5X memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1597", "title": "Optimizing On-Device ML Energy Consumption on Apple A17 Pro", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize an A17 Pro model's energy consumption considering Horowitz principles and hardware operator efficiency?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1598", "title": "Optimizing Transformer Inference on Samsung Exynos 2400 NPU: Attention Complexity and KV-Cache Management", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you 
analyze the fundamental architectural and scaling challenges of standard Transformer models that contribute to these issues on this specific mobile platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1599", "title": "Apple A17 Pro: Optimizing Transformer Inference for Low-Power Mobile Devices", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize transformer inference on Apple A17 Pro for low-power mobile deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1600", "title": "Snapdragon NPU LLM Deployment: Cost Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the memory footprint and theoretical sequence length limit for a 7B parameter LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1601", "title": "Optimizing Large Transformer Inference on Snapdragon 8 Gen 3 NPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What architectural and algorithmic optimizations would you propose to manage memory bandwidth, computational limits, and KV-cache on the NPU's INT8 capabilities and limited memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1602", "title": "A17 Pro Neural Engine and Depthwise Separable Convolutions", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should depthwise separable convolutions be used to improve mobile inference efficiency without destroying accuracy?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1603", "title": "Optimizing On-Device Object Detection for Google Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign an on-device object detection model to meet sub-50ms latency while preserving accuracy and efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1604", "title": "MobileNetV3 Latency Anomaly on Apple A17 Pro Neural Engine", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "How would you diagnose why MobileNetV3-Large is taking about 200ms on the A17 Pro despite a low FLOP count?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1605", "title": "Optimizing Large Context Attention for Mobile NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Compare these two attention mechanisms focusing on their implications for memory bandwidth, computational efficiency, and latency on this specific hardware. What are the key trade-offs and how would you evaluate them?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1606", "title": "Optimizing Large Language Model Attention for Apple A17 Pro", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you modify LLM attention and memory handling so an 8192-token model can run in real time on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1607", "title": "Model Footprint on Snapdragon Hexagon NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the minimum memory footprint of this model on the NPU and discuss whether it's feasible given the available memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1608", "title": "On-Device LLM Deployment Feasibility on Google Tensor G3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze whether this model can run effectively on the NPU, and what are the key considerations for achieving optimal performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1609", "title": "Mobile Model Deployment Feasibility: Samsung Exynos NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you assess whether MobileVisionNet-L can meet 30 FPS on the Exynos 2400 NPU given memory and throughput limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-1610", "title": "A17 Pro Mobile Model Memory Footprint", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Propose a simplified convolutional neural network architecture and estimate its peak memory footprint to determine if it is feasible given the 500 MB constraint.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1611", "title": "Diagnosing Large Language Model Deployment on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the potential reasons for the memory errors and outline a plan to determine if this model, or a derivative, can be successfully deployed on the specified hardware?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 1}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1612", "title": "LLM Deployment Feasibility on Mobile NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are your immediate concerns, and how would you perform a quick estimate to determine if deploying a 7B parameter LLM on a 12 GB device is achievable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1613", "title": "Evaluating Transformer Architectures for On-Device Deployment on Apple A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare the two INT8 Transformer models for feasibility on A17 Pro across memory, throughput, and power?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 2}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1614", "title": "Hardware-Aware NAS for Samsung Exynos 2400 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach a hardware-aware Neural Architecture Search (NAS) to find an optimal model architecture, specifically considering TOPS and memory constraints, and the implications of MCUNet-style approaches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1615", "title": "A17 Pro Hardware-Aware NAS Performance Analysis", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose why a hardware-aware NAS model still exhibits high latency and power draw despite low theoretical FLOPs?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1616", "title": "Hardware-Aware NAS for Real-time Mobile AR", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS search for real-time AR segmentation on the Snapdragon 8 Gen 3 Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1617", "title": "Hardware-aware NAS for Mobile LLM Deployment on Google Tensor G3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What memory penalty term would you add to an LLM NAS fitness function for 4-bit weights and an 8 GB peak memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1618", "title": "Hardware-Aware NAS for On-Device Deployment on Samsung Exynos 2400 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you define the search space, objective, and search strategy for NAS under strict latency and memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1619", "title": "Snapdragon 8 Gen 3: Encoder-Decoder Architecture Tradeoffs for On-Device AI", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the latency, memory, and power tradeoffs among encoder-only, decoder-only, and encoder-decoder models?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1620", "title": "Encoder-Decoder Tradeoffs for On-Device NPU Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which encoder, decoder, or encoder-decoder architecture would you choose for an Exynos 2400 AI assistant and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-1621", "title": "On-Device Real-time Translation with Apple A17 Pro: Architecture Tradeoffs", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the tradeoffs for each approach considering the A17 Pro's specific hardware constraints and the service requirements?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1622", "title": "Optimizing Encoder-Decoder for On-Device Mobile Deployment on Google Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a system architecture, considering the tradeoffs between encoder-only, decoder-only, and full encoder-decoder approaches, specifically for the Google Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1623", "title": "Mobile LLM Architecture Tradeoffs", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you justify your architectural choice, and what back-of-the-envelope calculations would you perform to validate its feasibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1624", "title": "Mobile LLM Architecture Tradeoffs on Apple A17 Pro", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare a hybrid specialized-model approach against a single decoder-only LLM for an A17 Pro assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1625", "title": "Optimizing Encoder-Decoder Architectures for Google Tensor G3 On-Device NLU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose performance bottlenecks for each architecture on the Tensor G3 and quantify costs to make your recommendation?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1626", "title": "Differentiating Pruning Techniques for Mobile ML Acceleration on Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do unstructured and structured pruning differ for speedups, implementation complexity, and TPU friendliness?", "chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1627", "title": "Pruning for On-Device LLM Inference on Samsung Exynos 2400 NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do structured vs unstructured pruning methods align with the NPU's processing capabilities and memory bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-006-34"], "chain_positions": {"mobile-chain-auto-secondary-006-34": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1628", "title": "Optimizing LLM Deployment on Snapdragon Hexagon NPU with Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use structured pruning to speed up an LLM on the Snapdragon Hexagon NPU while minimizing accuracy loss?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1629", "title": "Optimizing a Large Language Model for On-Device Deployment on Apple A17 Pro with Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine structured pruning and quantization to fit and accelerate a 7B LLM on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1630", "title": "Optimizing LLM Deployment on Snapdragon Hexagon NPU via Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose between different sparsity patterns to align with the NPU's capabilities and meet memory/latency targets?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-1631", "title": "Optimizing Large Language Model Inference with Structured Sparsity on Google Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity strategy for a Tensor G3 LLM that balances accuracy, latency, and power?", "chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1632", "title": "Knowledge Distillation for Vision Model Deployment on Snapdragon 8 Gen 3 NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Design an implementation strategy detailing distillation techniques and how you would leverage the Snapdragon 8 Gen 3 NPU specs.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1633", "title": "Diagnosing Knowledge Distillation Deployment Issues on Google Tensor G3 for Mobile ML", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose intermittent latency spikes and high memory use in a distilled student model on Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1634", "title": "Optimizing Large Language Models for Apple A17 Pro with Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use knowledge distillation to create an A17 Pro-ready LLM student model within memory and power limits?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1635", "title": "Optimizing Knowledge Distillation for Vision Models on Samsung Exynos 2400 NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you explain when distillation would be preferred over pruning for this specific hardware and accuracy target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1636", "title": "Hexagon NPU Graph Compilation Analysis: Latency & Memory for Large Models", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "Which Hexagon NPU behaviors reveal poor operator lowering or constant folding in an INT8 AOT-compiled transformer?", "chain_ids": ["mobile-chain-auto-secondary-003-08"], "chain_positions": {"mobile-chain-auto-secondary-003-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1637", "title": "Tensor G3 On-Device ML Compiler Design for Real-time Vision", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Design a comprehensive graph compilation and optimization strategy using AOT compilation, operator lowering, and constant folding to hit 60 FPS on Tensor G3.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1638", "title": "Quantization and Operator Lowering for ViT on Exynos 2400 NPU", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical inference latency for a single image under each optimization strategy, assuming full NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1639", "title": "A17 Pro ML Compiler Design for Real-time Vision", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you leverage operator lowering, constant folding, and other compiler-driven optimizations to meet the 15ms latency and ~5W power constraints on the A17 Pro's Neural Engine?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": {"mobile-chain-auto-secondary-003-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1640", "title": "Optimizing LLM Deployment on Snapdragon Hexagon NPU via Graph Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do CustomSwish lowering and constant folding change total INT8 operations and latency on the Snapdragon Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-003-08"], "chain_positions": {"mobile-chain-auto-secondary-003-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1641", "title": "Optimizing Vision Transformer for Apple A17 Pro Neural Engine", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert and optimize a dynamic PyTorch vision 
transformer for low-latency A17 Pro Neural Engine execution?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": {"mobile-chain-auto-secondary-003-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1642", "title": "Operator Scheduling: Layer Fusion on Samsung Exynos 2400 NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is layer fusion in operator scheduling, and how does it reduce latency for a CNN on an NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1643", "title": "Apple A17 Pro Neural Engine: Optimizing Operator Scheduling for Low Latency Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator scheduling for an LLM pipeline with idle gaps and CPU-Neural Engine transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1644", "title": "Optimizing MobileNetV3 Operator Scheduling on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule depthwise, pointwise, and activation operators for low-latency MobileNetV3 execution on Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1645", "title": "Tensor G3 Operator Scheduling for Memory and Throughput Optimization", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule and fuse OpA, OpB, and OpC on Tensor G3 to reduce peak memory versus naive execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1646", "title": "Diagnosing NPU Latency on Samsung Exynos 2400: Operator Scheduling Challenges", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the specific root causes of this performance bottleneck using NPU profiling tools?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1647", "title": "On-Device LLM Deployment on Apple A17 Pro: Operator Scheduling for Latency and Memory Optimization", "topic": "operator-scheduling", "competency_area": 
"optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule operators in a 5B INT8 LLM on A17 Pro to meet a 50ms per token generation latency within 8 GB memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1648", "title": "LLM Operator Scheduling on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule an LLM block on the NPU to reduce latency, reuse memory, exploit parallelism, and apply layer fusion to meet a 100ms latency target?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1649", "title": "Optimizing Object Detection on Google Tensor G3: Operator Scheduling for MobileNetV3-SSD vs. EfficientDet-Lite", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize execution via operator scheduling, and which model achieves better throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1650", "title": "Hexagon NPU Transformer Scheduling for LLMs", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design transformer operator scheduling on the NPU to maximize throughput and minimize memory traffic?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1651", "title": "MLOps Artifacts for On-Device AI with Google Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What non-binary artifacts should a model registry version to keep Tensor G3 on-device AI training and serving consistent?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1652", "title": "MLOps Lifecycle for On-Device Deployment on Snapdragon Hexagon NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design 
CI/CD, monitoring, and efficiency metrics for an on-device object detector deployment?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": {"mobile-chain-auto-secondary-006-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1653", "title": "On-Device Model Drift Diagnosis on Google Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your investigation steps, focusing on MLOps lifecycle elements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1654", "title": "MLOps Pipeline for Edge ML on Samsung Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive MLOps pipeline that ensures reliable and reproducible model delivery to the mobile devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1655", "title": "Optimizing Edge ML Deployment on Samsung Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose this 15 FPS bottleneck and quantify a specific optimization toward the 30 FPS target?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1656", "title": "Scalable MLOps for On-Device AR on Apple A17 Pro", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build an MLOps pipeline for A17 Pro AR that preserves reproducibility, consistency, performance, and power limits?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1657", "title": "A17 Pro LLM Deployment with CoreML Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your strategy to address CoreML operator gaps and ensure efficient, low-latency execution within the hardware constraints?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": {"mobile-chain-auto-001-02": 2}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1658", "title": "Diagnosing TFLite Performance Regressions on Google Tensor G3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose Tensor G3 TFLite latency spikes and accuracy shifts caused by conversion, op coverage, or delegation gaps?", "chain_ids": ["mobile-chain-auto-001-08"], "chain_positions": {"mobile-chain-auto-001-08": 0}, "chain_tiers": {"mobile-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1659", "title": "Exynos 2400 NPU Deployment for a Transformer-based Object Detector", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert, optimize, and validate a PyTorch object detector for 30 FPS on a mobile NPU?", "chain_ids": ["mobile-chain-auto-001-09"], "chain_positions": {"mobile-chain-auto-001-09": 1}, "chain_tiers": {"mobile-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1660", "title": "Optimizing Model Conversion for Apple A17 Pro Neural Engine", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach this problem to maximize performance and minimize power consumption on the A17 Pro, considering its specific hardware capabilities?", "chain_ids": ["mobile-chain-auto-001-12"], "chain_positions": {"mobile-chain-auto-001-12": 0}, "chain_tiers": {"mobile-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1661", "title": "NPU Model Deployment and Optimization on Snapdragon 8 Gen 3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design and compare the end-to-end conversion, optimization, and deployment pipelines for both models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1662", "title": "Optimizing LLM Deployment on Apple A17 Pro: CoreML Conversion & Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle CoreML conversion gaps and CPU/GPU fallbacks to achieve real-time inference on the A17 Pro?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 5}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1663", "title": "Tensor G3 Model Rollout: Choosing a Strategy", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which rollout strategy would you use for a Tensor G3 on-device model update (7.5 TOPS TPU, 12 GB shared RAM) to safely collect metrics without degrading user battery life?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1664", "title": "Optimizing ML Model Rollouts on Mobile NPUs for a Mobile Application", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive A/B and rollout strategy for this new model, considering the unique constraints and opportunities presented by on-device inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1665", "title": "Canary Rollout Strategy for ML Model on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you determine the initial canary traffic percentage, and what NPU-centric metrics would you monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1666", "title": "Designing a Phased Rollout for Edge ML on Samsung Exynos 2400 NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the deployment process from initial testing to full global rollout, ensuring minimal user impact and effective performance monitoring?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1667", "title": "Evaluating Rollout Strategies for On-Device ML Model Architectures on Snapdragon Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you A/B test and progressively roll out two model architectures with different latency and operability to decide the final production model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1668", "title": "On-Device ML Model Canary Rollout on Google Tensor G3", "topic": "ab-rollout-strategies", 
"competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you monitor a mobile canary rollout's performance, ensure stability, and decide on a full rollout or rollback?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1669", "title": "Diagnosing & Quantifying Performance Bottlenecks in Canary Rollouts on Exynos 2400 NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the 15% latency regression and quantify the impact of a potential fix?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1670", "title": "ML Model Rollout on Apple A17 Pro with Progressive Deployment", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the model update be progressively rolled out while monitoring latency, heat, battery, and ensuring rollback mechanisms?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1671", "title": "On-device PSI Calculation for Predicted Class Drift on Snapdragon NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you efficiently store hourly predicted-class counts and compute PSI for a 10-class on-device classifier?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1672", "title": "Diagnosing Data Drift on Google Tensor G3 for On-Device ML", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you distinguish data drift, concept drift, and training-serving skew for a Tensor G3 image classifier under low-light inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1673", "title": "On-Device Data Drift Detection for Real-time ML on Samsung Exynos 2400 NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", 
"question": "How would you design a robust, low-overhead on-device drift detector for an image model with reliable fallback triggers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1674", "title": "Drift Detection Strategies for Mobile NPU Deployments", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should Snapdragon 8 Gen 3 drift detection use server KL divergence, on-device PSI, or a hybrid design?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1675", "title": "Exynos NPU Drift: Optimizing On-Device Reliability", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement and optimize on-device drift detection without hurting primary inference latency?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1676", "title": "On-Device Drift Detection for Gesture Recognition", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design on-device drift detection for gesture recognition while minimizing battery and user impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1677", "title": "Graceful Degradation for On-Device ML on Samsung Exynos 2400", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design degradation ladders, model fallbacks, fail-safe modes, and QoS shedding for Exynos 2400 on-device ML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1678", "title": "Graceful Degradation for On-Device ML with Snapdragon NPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build a graceful degradation strategy for a mobile AI assistant under thermal, battery, and load constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1679", "title": "Adaptive NLU on Tensor G3: Resource-Aware Graceful Degradation", "topic": "graceful-degradation", 
"competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy for the NLU system to maintain critical functionality under dynamic resource constraints?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1680", "title": "Graceful Degradation for On-Device ML on Apple A17 Pro", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy for this system, detailing degradation ladders, model fallbacks, and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1681", "title": "Graceful Degradation for On-Device AR on Google Tensor G3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which architecture would you recommend and why, specifically referencing the Tensor G3's capabilities, and what specific metrics would you monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1682", "title": "A17 Pro Vision Model Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you degrade an A17 Pro AR vision system under frame drops, latency, and battery drain while preserving core functionality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1683", "title": "Graceful Degradation for On-Device LLM on Snapdragon 8 Gen 3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you dynamically adjust the model's output quality or inference parameters to manage NPU and memory load through QoS shedding and degradation ladders?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1684", "title": "NPU Functional Safety for ISO 26262 ASIL B", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware-software co-design elements are needed for ISO 26262 ASIL B compliance on Snapdragon Hexagon?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1685", "title": "Tensor G3 ADAS Functional Safety Analysis (ASIL-B)", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should watchdogs, self-tests, and deterministic TPU execution make a Tensor G3 ADAS warning feature ASIL-B ready?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1686", "title": "Designing a Safety-Critical ML System on Samsung Exynos 2400 NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect a safety-critical pedestrian detection system on the NPU to satisfy ISO 26262 ASIL B?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1687", "title": "Real-time Neural Engine Safety Self-Test on Apple A17 Pro for ADAS (ASIL B)", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the maximum allowable duration for the critical self-test per 50ms cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1688", "title": "Designing a Safety-Critical Edge ML System for Autonomous Mobile Robotics on Google Tensor G3", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the ML pipeline and integrate system-level safety mechanisms to comply with functional safety principles, ensuring deterministic execution and high reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1689", "title": "Functional Safety for Autonomous Driving on Exynos 2400", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement self-tests and watchdog timers for ASIL D autonomous driving workloads on the Exynos 2400 NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1690", "title": "Functional Safety Design for ADAS on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the NPU software architecture for ASIL-B emergency braking with deterministic execution and fault mitigation?", "chain_ids": 
["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1691", "title": "A17 Pro NPU Adversarial Attack Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What on-device adversarial attack and lightweight defense would you choose for fraud detection on the A17 Pro Neural Engine?", "chain_ids": ["mobile-chain-auto-secondary-009-02"], "chain_positions": {"mobile-chain-auto-secondary-009-02": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1692", "title": "Mitigating Adversarial Attacks on On-Device ML for Google Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a robust system architecture to detect and mitigate adversarial threats on the Tensor G3 platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1693", "title": "Quantifying Adversarial Attack Impact on Mobile NPU Inference Latency", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the latency increase from adversarial training on a MobileNetV3 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1694", "title": "Diagnosing Adversarial Impact on On-Device ML Reliability with Apple A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose an adversarial side-channel attack on an A17 Pro model running under high system load?", "chain_ids": ["mobile-chain-auto-secondary-009-02"], "chain_positions": {"mobile-chain-auto-secondary-009-02": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-02": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1695", "title": "Designing an Adversarially Robust Mobile ML System on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the system's architecture and defense strategies to guarantee a robust and secure operation within NPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1696", "title": "On-Device Adversarial Input Detection for Safety-Critical ML on Tensor 
G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you detect and mitigate input-level adversarial attacks on a real-time object detector without impacting performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1697", "title": "Mobile NPU Adversarial Robustness Evaluation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate adversarial training versus randomized smoothing for robustness, latency, and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1698", "title": "Optimizing Adversarial Robustness on Snapdragon NPU for Mobile Reliability", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the specific bottleneck causing this performance degradation and propose an optimization strategy to restore application reliability, quantifying the expected improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1699", "title": "On-device Adversarial Attack Mitigation for Google Tensor G3 ML Model", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you defend a Tensor G3 harmful-content classifier against imperceptible adversarial perturbations within real-time limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1700", "title": "Monitoring On-Device ML Health on Snapdragon 8 Gen 3 NPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What key telemetry metrics would you collect from the NPU and application to monitor runtime health and detect degradation?", "chain_ids": ["mobile-chain-auto-secondary-006-23"], "chain_positions": {"mobile-chain-auto-secondary-006-23": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1701", "title": "Quantifying Straggler Impact on A17 Pro ML Inference Latency", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compute added latency from A17 Pro inference stragglers and define a production Straggler Impact Score?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 0}, "chain_tiers": 
{"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1702", "title": "Diagnosing Latency Spikes on Snapdragon 8 Gen 3 NPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose sustained-load latency spikes in a Snapdragon 8 Gen 3 video analytics model using NPU telemetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1703", "title": "Diagnosing Stragglers on Mobile NPU with Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you extend monitoring to find intermittent NPU stragglers and reduce MTTR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1704", "title": "Designing Reliable On-Device ML Monitoring for Apple A17 Pro", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect low-power monitoring for an on-device AI feature to catch regressions, stragglers, drift, and failures?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1705", "title": "Diagnosing and Quantifying NPU Bottlenecks on Snapdragon 8 Gen 3 with Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you detect NPU stragglers, diagnose bottlenecks, and quantify optimization impact before rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1706", "title": "On-Device Real-time Sensor Fusion Pipeline for Apple A17 Pro", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a power-efficient A17 Pro sensor and camera data pipeline for immediate on-device inference?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 3}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1707", "title": "Optimizing On-Device Data Ingestion for Apple A17 Pro", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "How would you eliminate the data loading and preprocessing bottlenecks for a 100 MB/s raw sensor stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1708", "title": "Optimizing On-Device ML Data Ingestion for Real-time Video on Google Tensor G3", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the potential bottlenecks in this mobile data pipeline and propose concrete optimization strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1709", "title": "Real-time Mobile ML Data Pipeline Optimization on Exynos 2400 NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a 4K60 Exynos 2400 data pipeline that keeps the NPU fed without dropping frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1710", "title": "On-Device Data Schema Validation for Google Tensor G3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What lightweight data schema validation strategy ensures sensor integrity before inference without burning the CPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1711", "title": "Data Corruption in Edge ML Model on Samsung Exynos 2400 NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and prevent malformed camera data from degrading an Exynos 2400 object detector?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1712", "title": "On-Device Data Quality & Validation for Health ML on Apple A17 Pro", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design on-device data quality, lineage, and anomaly detection for a health monitoring pipeline within a 5W continuous budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1713", "title": "On-Device ML Data Quality for Samsung Exynos 2400 NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", 
"question": "Design a comprehensive data quality and validation system for this pipeline. How would you identify and handle unexpected patterns or outliers in the sensor data stream or model inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1714", "title": "On-Device Data Quality for A17 Pro Mobile ML", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a robust on-device data quality pipeline for high-frequency sensor data on the Apple A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1715", "title": "On-Device ML Data Quality: Centralized vs. Edge Validation on Snapdragon NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you discuss the trade-offs and recommend which validation approach is most suitable for a mission-critical mobile application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1716", "title": "On-Device Data Integrity and Schema Evolution for ML on Google Tensor G3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you enforce evolving data contracts and on-device anomaly checks for Tensor G3 physiological sensor streams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1717", "title": "Exynos NPU Data Contract Violations", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you fix NPU stalls caused by subtle sensor data contract violations in an AR pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1719", "title": "Optimizing On-Device Image Classification Dataset with Active Learning on Apple A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an active learning pipeline that minimizes on-device cost while ensuring high-quality data selection?", "chain_ids": ["mobile-chain-auto-027-15"], "chain_positions": {"mobile-chain-auto-027-15": 0}, "chain_tiers": {"mobile-chain-auto-027-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1720", "title": "On-Device Object Detection Dataset Curation for Snapdragon 8 Gen 3 NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": 
"training", "question": "How would you design a continuous active learning strategy to identify and mitigate biases on-device using the Snapdragon 8 Gen 3 NPU?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 2}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1721", "title": "On-Device Active Learning for Image Classification on Mobile TPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an on-device data selection strategy and annotation workflow to efficiently integrate new labels while respecting memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1722", "title": "Diagnosing Mobile Gesture Model Bias on Exynos NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose real-world gesture false negatives as a dataset curation and labeling problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1723", "title": "On-device Active Learning for Gesture Recognition Dataset Curation on Apple A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a privacy-preserving on-device active learning system for A17 Pro gesture recognition data curation?", "chain_ids": ["mobile-chain-auto-027-15"], "chain_positions": {"mobile-chain-auto-027-15": 1}, "chain_tiers": {"mobile-chain-auto-027-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1724", "title": "On-Device Rare Class Data Curation for Snapdragon NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How many expert annotation person-hours are needed to collect 5,000 rare-class instances at 30 seconds per image?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1725", "title": "On-Device Dataset Curation for Bias Mitigation on Snapdragon 8 Gen 3", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage the NPU's capabilities for data selection, active learning, and annotation workflows to mitigate low-light and rural detection bias?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 3}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1726", "title": "Real-time Sensor Stream Processing on Apple A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you process high-frequency sensor streams for A17 Pro anomaly detection with low latency and low power?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1727", "title": "Optimizing Real-Time Sensor Processing on Snapdragon NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the system behavior and explain why these issues are occurring, considering the NPU's specifications and the nature of mobile streaming data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1728", "title": "Real-time Physiological Anomaly Detection on Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the stream processing and ML pipeline on the Tensor G3 to balance real-time anomaly detection with power efficiency?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1729", "title": "Real-Time Sensor Data Ingestion and Anomaly Detection on Edge NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you ingest 1 kHz sensor data and run 1-second-window anomaly inference on the NPU within 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1730", "title": "Diagnosing High Latency in On-Device Streaming Feature Computation on Apple A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do sensor feature computations exceed 500 ms when raw ingestion is stable and the target is 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1731", "title": "Real-time Edge ML System Design for Mobile Sensor Data", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an edge ML system for real-time processing of 
high-frequency sensor data (e.g., IMU, audio, camera metadata) on a mobile device equipped with a Snapdragon 8 Gen 3 Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1732", "title": "On-Device Sensor Stream Processing with Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a streaming data pipeline to handle 1000 samples/s ingestion, pre-processing, and ML inference on the Tensor G3 architecture?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1733", "title": "Real-Time Sensor Fusion on Exynos 2400: Edge vs. Hybrid Processing Architectures", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare pure edge and hybrid edge-cloud sensor processing for a fitness app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1734", "title": "Real-time Gesture Recognition on Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the data ingestion and feature computation pipeline to meet the strict sub-50ms end-to-end latency constraint?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1735", "title": "Designing On-Device Data Efficiency for Personalized ML on Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the data selection, retention, and processing pipeline for an on-device personalized ML system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1736", "title": "Optimizing On-Device ML with Coreset Selection on Snapdragon 8 Gen 3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you determine the maximum feasible coreset size for on-device retraining given a 1 hour target time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1737", "title": "On-Device Vision Model Data Optimization for Apple A17 Pro", "topic": 
"data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What measures would you implement to prevent model collapse risks during aggressive data optimization on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1738", "title": "Optimizing On-Device ML with Coreset Selection on Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can coreset selection reduce the data footprint for efficient on-device personalization of an A17 Pro recommendation model?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1739", "title": "Federated Learning for On-Device LLM on Apple A17 Pro", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would your design orchestrate federated averaging for millions of A17 Pro devices handling non-IID data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1740", "title": "Federated LLM Personalization on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Detail the system architecture, data flow, and key algorithms, including how you would handle model updates and aggregation, given the constrained mobile environment.", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 4}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1741", "title": "Federated LLM Personalization on Google Tensor G3: Scaling and Non-IID Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you 'size' the overall system to handle the expected scale and data characteristics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1742", "title": "Federated Learning Optimization for Cross-Device Personalization on Edge NPUs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reduce federated learning communication delay, staleness, and non-IID effects on Exynos 2400 devices?", "chain_ids": 
["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 3}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1743", "title": "Federated Learning on Apple A17 Pro: Optimizing for Non-IID Data and Communication Efficiency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a FedAvg system for the A17 Pro considering its 35 TOPS, 5W budget, and non-IID data challenges?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1744", "title": "DP-SGD and Privacy Budgeting on Snapdragon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do epsilon, delta, noise calibration, and the privacy-utility tradeoff apply to DP-SGD on Snapdragon Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1745", "title": "DP-SGD on Mobile NPU: Securing User Data for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement differentially private federated learning for a MobileNetV2 health model on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1746", "title": "DP-SGD on Apple A17 Pro: Privacy-Utility Tradeoff Diagnosis", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose poor DP-SGD recommendation utility while preserving an annual epsilon budget of 8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1747", "title": "On-Device DP-SGD for Federated Learning on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the noise calibration mechanism and manage the privacy-utility tradeoff given NPU constraints?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1748", "title": "DP-SGD on Google Tensor G3 for On-Device Federated Learning", "topic": "differential-privacy", 
"competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calibrate clipping norms and Gaussian noise for DP-SGD under on-device memory and compute limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1749", "title": "Optimizing DP-SGD on Snapdragon 8 Gen 3 Hexagon NPU for Mobile Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize DP-SGD clipping and noise generation on a mobile NPU without weakening privacy guarantees?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1750", "title": "Designing a Fair AI System on Edge with Samsung Exynos 2400 NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the architecture and evaluation methodology to achieve this, considering the computational and memory constraints of the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1751", "title": "On-Device Fairness Evaluation for Image Classification on Apple A17 Pro", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the key resource considerations (TOPS, power, memory) for designing a practical, on-device evaluation strategy to continuously monitor demographic parity without negatively impacting user experience?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1752", "title": "Diagnosing Bias in NPU-Accelerated Facial Verification for Mobile", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What metrics would you collect, and what steps would you take to systematically pinpoint the root cause of this observed unfairness?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1753", "title": "Designing a Fair and Efficient On-Device ML System for Content Moderation on Google Tensor G3", "topic": "fairness-evaluation", 
"competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify a fair, efficient Tensor G3 content moderation model within 50ms latency and 200MB footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1754", "title": "Fairness Evaluation of On-Device Facial Recognition on Apple A17 Pro", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an evaluation plan to compare these two architectures for fairness, explicitly considering the constraints and capabilities of the target device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1755", "title": "On-Device Fairness Evaluation Architecture for Facial Recognition on Snapdragon Hexagon NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an architecture and methodology for continuously monitoring and reacting to potential demographic parity violations on-device within memory constraints?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1756", "title": "On-device Fairness Optimization for Google Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize Tensor G3 hardware bottlenecks that worsen subgroup fairness under high load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1757", "title": "Bias Mitigation on Edge AI: NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What metrics would you prioritize, and how would you adapt your evaluation approach to measure intersectional fairness given a 12 GB memory and 34.7 TOPS edge constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1759", "title": "Responsible AI Governance for On-Device LLM on Samsung Exynos 2400 NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might the model still exhibit biases on-device, and how do mobile NPU constraints complicate Responsible AI governance frameworks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1760", "title": "On-Device Responsible AI Guardrail Overhead on Snapdragon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the additional INT8 TOPS and average memory bandwidth required by this guardrail?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": {"mobile-chain-auto-secondary-011-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1761", "title": "Bias Detection in On-Device LLM on Tensor G3: Diagnosing a Fairness Regression", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose a post-deployment gender-bias regression in an on-device Tensor G3 summarization LLM?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1762", "title": "On-Device Responsible AI: Content Moderation Guardrails on Samsung Exynos 2400 NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design responsible AI architecture and governance for Exynos 2400 on-device video content moderation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1763", "title": "Responsible AI Evaluation on Snapdragon 8 Gen 3 for On-Device Sentiment Analysis", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate two mobile sentiment models for Responsible AI, governance, fairness, and resource impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1764", "title": "Realizing Responsible AI on Google Tensor G3: On-Device Content Moderation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Tensor G3 on-device content moderation system handle guardrails, impact assessment, and resource limits?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1765", "title": "Optimizing Responsible AI Guardrails on Samsung Exynos NPU", "topic": 
"responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Exynos 2400 responsible AI guardrails to reduce total inference latency from 350ms back down to the 200ms target?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1766", "title": "Designing Responsible AI for On-Device LLM Deployment on Apple A17 Pro", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a Responsible AI framework for an A17 Pro on-device LLM handling sensitive personal data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1767", "title": "Token-Level Safety Classifier Bottleneck", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 200M INT8 safety classifier cut Gemini Nano generation from 20 to 5 tokens/sec on Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1768", "title": "Tensor G3 TPU Memory Bandwidth Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the TPU compute capacity severely underutilized, and what physical characteristic constrains the generation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1769", "title": "On-Device Coreset Selection for Gemini Nano", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does random pruning cause model collapse under these hardware constraints, and what is the tradeoff of switching to gradient-based coreset selection?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1770", "title": "Analyzing NAS Latency on A17 Pro", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can an A17 Pro NAS model with 30% fewer MACs have 40% higher latency, and how 
would you analyze it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1771", "title": "Data Validation Gate Memory Bottleneck", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an INT8 autoencoder gate plus 1 GB main model miss a 33.3 ms video deadline on Snapdragon 8 Gen 3?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1780", "title": "Roofline Comparison: CPU vs Neural Engine for LLM on A17 Pro", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does roofline analysis guide the choice between CPU and Neural Engine for a batch-1 INT4 7B LLM decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1781", "title": "Attention Head Pruning for On-Device Transformer on Snapdragon 8 Gen 3", "topic": "attention-scaling", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the latency and accuracy trade-offs of pruning 50% of attention heads in a 6-layer 12-head transformer on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1782", "title": "Dynamic Quantization vs Static Quantization for Transformer on Tensor G3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you choose between dynamic and static quantization for a Tensor G3 assistant model with variable activation ranges?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1783", "title": "Unified Memory Architecture Impact on LLM Decode on A17 Pro", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much does UMA contention slow 7B INT4 LLM decode under display, camera, and background bandwidth load?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 1}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1784", "title": "Batch Size 1 Optimization for Interactive LLM on Snapdragon 8 Gen 3", "topic": "batching-strategies", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": 
"inference", "question": "How would you explain the utilization gap and propose how to increase decode throughput?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1785", "title": "Neural Engine Power Efficiency vs CPU for Transformer on A17 Pro", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much energy does a 50M-FLOP inference use on A17 Pro Neural Engine, E-cores, and P-cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1786", "title": "Core ML Palettization vs GGUF Quantization for LLM on iPhone", "topic": "model-serving-infrastructure", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does GGUF Q4_K_M reach 15 tok/s while Core ML palettization reaches 12 tok/s for Llama-3-8B on iPhone 15 Pro?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1787", "title": "Layer-wise Relevance Propagation for Pruning Sensitivity on Snapdragon 8 Gen 3", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning on MobileViT-XS yield only a 15% latency reduction on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1788", "title": "Thermal Aware Inference Scheduling on iPhone During Photo Processing", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should a photo ML task schedule 5s of A17 Pro Neural Engine work during 4K30 capture at thermal state 2?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1789", "title": "On-Device KV Cache Memory Budget for LLM", "topic": "attention-scaling", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache memory does the 2B INT4 LLM use at 3000 tokens with 28 layers, 16 KV heads, and FP16 cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1790", "title": "OOM Diagnosis for LLM Context Extension on A17 Pro", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the memory growth pattern and identify what causes the OOM at 2048 tokens?", "chain_ids": ["mobile-chain-auto-014-20"], 
"chain_positions": {"mobile-chain-auto-014-20": 1}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1791", "title": "Multi-Task Model Specification for On-Device AI Assistant on A17 Pro", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which model sizes, quantization levels, and execution units fit ASR, intent, and LLM generation in 1.5 GB on iPhone 15 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1793", "title": "Grouped Query Attention Memory Impact for Long Context on Snapdragon 8 Gen 3", "topic": "attention-scaling", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV cache size does GQA use for a 16K-context Mistral-7B, and how does it compare with MHA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1794", "title": "On-Device Training with LoRA Adapter on A17 Pro — Feasibility Analysis", "topic": "pruning-sparsity", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is LoRA fine-tuning a 3B INT4 iOS language model with rank 8 adapters and 100 examples feasible on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1795", "title": "Mixed Precision Training Impact on Convergence for Mobile NLP Model", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why might mixed-precision BERT training for Tensor G3 hit NaN loss at step 1200 despite loss scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1796", "title": "Attention vs Convolution Roofline for Mobile Vision Transformer", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do EfficientViT MSA and FFN blocks reach 32 TOPS and 8 TOPS on a 45 TOPS Snapdragon NPU, and what optimizations apply to each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1797", "title": "Energy-Delay Product Optimization for Inference on A17 Pro", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate energy-delay product (EDP) for each mode and determine when each is optimal?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": 
{"mobile-chain-bucket-powerbud-05": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1798", "title": "Adaptive Bitrate Quantization Based on Thermal State on Snapdragon 8 Gen 3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Design the transition logic with hysteresis to allow switching among INT8, INT4, and INT2 thermal states without oscillating.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1801", "title": "Mobile Secure Aggregation Fan-In", "topic": "federated-learning", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many secure-aggregation validation shards are needed to process 80,000 mobile updates within 4 seconds?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1805", "title": "Shared Mmap vs Heap Allocation for Multi-Process Edge LLM", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the tradeoffs between heap allocation and shared memory-mapped (mmap) file access, and determine which architecture is required for this system.", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1806", "title": "Implementing Shared mmap for Edge LLMs", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much RAM does shared read-only mmap save for two Jetson Orin processes loading the same 15 GB INT8 LLM, and which flags are required?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1807", "title": "CoreML vs ONNX Runtime on Apple A17 Pro Neural Engine", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you explain and reduce the 50% latency overhead of ONNX Runtime versus direct CoreML on an A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1808", "title": 
"Cross-Platform Mobile Inference: TFLite GPU Delegate on Android vs CoreML on iOS", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which iOS and Android runtimes meet <20 ms style transfer on A17 Pro and Snapdragon 8 Gen 3 from one model codebase?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1809", "title": "Metal Performance Shaders Graph for On-Device Model Compilation on iOS", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should an A17 Pro diffusion model use CoreML or MPSGraph when attention-heavy layers limit ANE support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1810", "title": "Snapdragon QNN SDK vs TFLite for Hexagon NPU Utilization", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does QNN run MobileNetV3-large at 1.8ms versus 3.2ms for TFLite Hexagon on Snapdragon 8 Gen 3, and which should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1811", "title": "Portable On-Device LLM Inference Across Mobile Platforms", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you serve a 3B LLM at over 10 tok/s on both A17 Pro and Snapdragon 8 Gen 3 without exceeding mobile memory?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1812", "title": "Android NNAPI Delegation Failures and Fallback Performance", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the model run 9x slower on the Exynos NNAPI despite the delegate being active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1813", "title": "WebAssembly SIMD as a Universal Mobile Inference Backend", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate whether WASM SIMD can replace native mobile inference for a MobileNetV2 model?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1814", "title": "Thermal Throttling Impact on Portable Model Benchmarks", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do mobile benchmarks deteriorate, and how does this affect portability comparisons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1815", "title": "Portable Feature Extraction Pipeline: Camera Input to Model Input", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would one preprocessing interface convert iOS BGRA and Android YUV_420_888 frames into a float tensor efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1816", "title": "Unified Model Versioning for Multi-Platform Mobile Deployment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should a mobile team manage 12 models, 3 quality tiers, and monthly updates without version drift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1817", "title": "On-Device Interconnect Bottleneck for NPU Inference", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NoC bandwidth, not the NPU's TOPS rating, determine achievable throughput for LLM inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1818", "title": "SoC Die-to-Die Interconnect for Heterogeneous Inference", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the layer-to-accelerator mapping that minimizes cross-accelerator data transfer?", "chain_ids": ["mobile-chain-auto-secondary-017-09"], "chain_positions": {"mobile-chain-auto-secondary-017-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1819", "title": "Chiplet Interconnect Bandwidth for Multi-Model Mobile", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate total memory bandwidth demand and determine if the NoC is the bottleneck?", "chain_ids": ["mobile-chain-auto-secondary-017-09"], "chain_positions": {"mobile-chain-auto-secondary-017-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-09": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1820", "title": "On-Device Differential Privacy for Mobile ML", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 100M users each contributing one data point per day, what epsilon provides meaningful utility, and how does LDP accuracy compare to central DP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1821", "title": "DP Noise Calibration on Mobile NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the standard deviation of Gaussian noise you must add to the clipped gradient (norm=1.0), and how much memory does the noise generation require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1822", "title": "Differential Privacy Impact on On-Device Model Size", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With batch size 32, how much memory do per-sample gradients consume, and does this fit alongside the model and activations within the OS app limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1833", "title": "Understanding why autograd is disabled on Core ML and how to replicate gradient-based effects", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How can a mobile app support few-shot adaptation when inference frameworks do not provide backward passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1834", "title": "Minimizing computational graph size for on-device fine-tuning on Snapdragon 8 Gen 3", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you shrink the autograd graph for LoRA fine-tuning a 1B LLM?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": {"mobile-chain-auto-secondary-016-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1835", "title": "Implementing memory-efficient backprop for on-device RL on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you compute PPO old-policy ratios without keeping two full 
autograd graphs in memory?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": {"mobile-chain-auto-secondary-016-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1836", "title": "Debugging gradient vanishing in a mobile RNN model on Snapdragon 8 Gen 3", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you diagnose and fix vanishing recurrent gradients in a 500-step GRU trained natively on a mobile accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1837", "title": "Optimizing Core ML conversion of custom autograd operations on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you convert a custom sparse attention autograd operation to Core ML for NPU deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1838", "title": "Efficient gradient-based hyperparameter search within 8GB on Snapdragon", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you make DARTS architecture search fit in 8GB on Snapdragon without running the full supernet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1839", "title": "Profiling and eliminating unnecessary tensor copies in mobile autograd pipeline", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you profile and remove unnecessary tensor copies in a PyTorch Mobile image pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1840", "title": "Implementing gradient-based input attribution for on-device explainability on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement Integrated Gradients for on-device explainability on A17 Pro without 50 sequential passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1841", "title": "Managing the autograd tape for battery-constrained on-device learning on Snapdragon", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": 
"How would you manage autograd and update scheduling so background personalization does not drain battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1842", "title": "Designing a two-phase autograd pipeline for on-device neural style transfer on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you structure neural style transfer so only the input image needs gradients, keeping VGG efficient enough to beat a 60-second target?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": {"mobile-chain-auto-secondary-016-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1843", "title": "Chiplet Compute-to-Memory Ratio for Mobile SoC Design", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you explain the design rationale for matching compute-to-memory ratio to target workloads?", "chain_ids": ["mobile-chain-auto-secondary-016-16"], "chain_positions": {"mobile-chain-auto-secondary-016-16": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1844", "title": "Apple A17 Pro Die Architecture and ANE Integration", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the A17 Pro's Neural Engine integrate with the CPU and GPU at the die level, and why does this differ from external accelerator chiplets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1845", "title": "Snapdragon 8 Gen 3 Hexagon NPU Memory Architecture", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the memory hierarchy to explain the 40ms decode latency for a 1B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1846", "title": "UCIe Standard and Future Mobile Chiplet Integration", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can UCIe integrate a third-party ML accelerator into Snapdragon within mobile power, package, and latency limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1847", "title": "Thermal Throttling Impact on Mobile Chiplet Burst Performance", "topic": "chiplet-architecture", "competency_area": "compute", "track": 
"mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How large is A17 Pro thermal throttling after two minutes of 30 FPS object detection, and what mitigation fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1848", "title": "DPO for Mobile On-Device Alignment: Feasibility Analysis", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate whether DPO or PPO is feasible and calculate the minimum memory requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1849", "title": "LoRA Inference Optimization on A17 Pro Neural Engine", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does Core ML reject dynamic LoRA on A17 Pro, and how should the adapters be compiled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1850", "title": "On-Device Personalization with Differential Privacy on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a differential privacy mechanism for rank-4 on-device LoRA training on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-016-18"], "chain_positions": {"mobile-chain-auto-secondary-016-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1851", "title": "Quantized LoRA Serving on Snapdragon 8 Gen 3", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should BF16 LoRA adapters run on Snapdragon 8 Gen 3 when Hexagon supports only INT8/INT4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1852", "title": "Multi-Adapter Session Management on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the session management system to efficiently switch between 5 LoRA adapters on an A17 Pro with 8 GB RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1853", "title": "Privacy Budget Allocation for On-Device Fine-Tuning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a 
per-epoch budget allocation strategy that maximizes model utility while respecting the lifetime budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1855", "title": "Guardrail Latency Budget on a Mobile LLM", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which guardrail mix fits a 200 ms safety budget after a 600 ms prefill on an NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1856", "title": "Algorithmic Accountability for On-Device Health Decisions", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a responsible AI governance framework that operates within mobile compute constraints and does not transmit raw health data off-device?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1857", "title": "Chiplet ISA Heterogeneity and ML Framework Portability", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you diagnose the ONNX Runtime portability issue for operators falling back to the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1858", "title": "Unified Memory Coherency for Mobile Chiplet Camera Pipeline", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix unified-memory cache coherency stalls between an ISP die and NPU die in a camera pipeline?", "chain_ids": ["mobile-chain-auto-secondary-016-16"], "chain_positions": {"mobile-chain-auto-secondary-016-16": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1859", "title": "Multi-Chiplet Thermal Throttle Recovery for Mobile Gaming", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain NPU throttling in a mobile chiplet gaming workload and redesign the thermal schedule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1860", "title": "On-Device LLM Tokenizer Bottleneck on Heterogeneous Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix tokenization and KV cache setup adding 900ms to a mobile Gemma 2B prompt on a heterogeneous chiplet?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1861", "title": "Chiplet Die-Count Impact on Mobile SoC PCB Routing", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you justify moving a chiplet mobile SoC board from 6 to 8 PCB layers for routing, power, and EMI risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1862", "title": "On-Device LoRA Training: Gradient Checkpointing on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does LoRA training for a 1B model OOM at batch 1 on an 8GB phone, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1863", "title": "Per-User LoRA Adapter Privacy on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you assess the risk and design mitigations?", "chain_ids": ["mobile-chain-auto-secondary-016-18"], "chain_positions": {"mobile-chain-auto-secondary-016-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1864", "title": "Adapter Selection Latency in Multi-Persona Mobile Assistant", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce mobile assistant LoRA adapter selection and loading latency from 200ms to under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1865", "title": "RLHF-Lite: Implicit Feedback Collection for Mobile Fine-Tuning", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a lightweight on-device RLHF pipeline that runs without a dedicated reward model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1866", "title": "LoRA Adapter Compatibility Across Model Version Updates", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Phi-3 Mini v1.0 LoRA adapters degrade for 15% of users after a v1.1 base-model update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1870", "title": 
"A17 Pro NPU Video Frame Drops", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the required frame buffer size K such that the M/M/1/K finite-queue blocking probability P_K is strictly less than 1%.", "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1871", "title": "Federated Learning Update Window", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the weight update complete within the OS background window without further compression?", "visual": {"kind": "svg", "path": "mobile-1871.svg", "alt": "Node diagram showing mobile app attempting to push 10 MB through a 2 Mbps 4G pipe to a cloud aggregator.", "caption": "Federated Update Transmission"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1872", "title": "A17 Pro LLM Concurrent Queue Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the maximum average request arrival rate (requests per second) the NPU can handle while ensuring the average queue wait time does not exceed 1.6 seconds.", "visual": {"kind": "svg", "path": "mobile-1872.svg", "alt": "Curve showing queue wait time increasing as the arrival rate approaches the NPU capacity of 2.5 requests per second.", "caption": "M/D/1 Wait Time Analysis"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1873", "title": "Mobile LoRA Atomic Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum LoRA adapter size that can be atomically saved within a 50ms OS kill warning window, and what strategy ensures safety?", "visual": {"kind": "svg", "path": "mobile-1873.svg", "alt": "Timeline showing OS kill warning at T=0, 50ms write operation to a temp file, followed by an atomic rename.", "caption": "Atomic Save within OS Warning Window"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1874", "title": "Split Computing BLE Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum allowable intermediate tensor size to ensure the BLE transmission adds no more than 100ms of latency?", "visual": {"kind": "svg", "path": "mobile-1874.svg", "alt": "Directed graph showing model layers on a watch, a BLE transmission bottleneck, and final layers on a phone.", "caption": "Watch-to-Phone Split Architecture"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1875", "title": "Snapdragon Segmentation Video Uplink", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What optimization to the binary mask transmission architecture ensures the frame push stays within the 5 Mbps bandwidth limit?", "visual": {"kind": "svg", "path": "mobile-1875.svg", "alt": "Graph showing raw segmentation masks passed through an RLE encoder before hitting the cellular uplink bottleneck.", "caption": "Segmentation Mask Uplink Optimization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1876", "title": "A17 Pro NVMe Intermediate State Checkpoint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the storage latency to write the checkpoint and determine if it is faster to recompute the frame (recompute takes 15ms).", "visual": {"kind": "svg", "path": "mobile-1876.svg", "alt": "A bar chart comparing the 15ms recompute time against the combined 8ms read/write checkpointing time.", "caption": "Recompute vs Checkpoint Latency"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1877", "title": "Snapdragon LPDDR5 Weight Bound", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the absolute minimum latency to evaluate a single dense linear layer containing 10 million weights.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1879", "title": "WASM ML Model Fetch Over 3G", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total download time and determine the optimal chunk size to provide an interactive loading bar updating 10 times a second.", "visual": {"kind": "svg", "path": "mobile-1879.svg", "alt": "Diagram showing a monolithic 20MB model being sliced into 50KB chunks for streaming over a slow network.", "caption": "WASM Model Chunking"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1880", "title": "A17 Pro 3B Model Sequence Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum sequence length achievable before the KV cache exceeds the strict 2.0 GB OS memory budget?", "visual": {"kind": "svg", "path": "mobile-1880.svg", "alt": "A bar chart breaking down the 2.0GB memory limit into 1.5GB of INT4 weights and 0.5GB of remaining KV cache space.", "caption": "iOS 2.0GB App Memory Budget"}, "chain_ids": 
["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 5}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1881", "title": "5G Model Update Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a specification of per-stage rates for the four-stage Cloud -> 5G modem -> Crypto/UFS -> NPU load pipeline, identify the binding bottleneck under the 64 MB ring constraint, and determine whether the 3-minute end-to-end SLA is feasible. Quantify the pipeline fill+drain cost added by the downstream NPU mapping stage.", "visual": {"kind": "svg", "path": "mobile-1881.svg", "alt": "A linear pipeline diagram showing data flowing left-to-right from the Cloud through the 5G modem, the Crypto/UFS stage, and into the phone's NPU.", "caption": "Model download and local transfer pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1882", "title": "A17 NPU Deterministic Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the average queue depth and total latency per token, treating the system as an M/D/1 queue.", "visual": {"kind": "svg", "path": "mobile-1882.svg", "alt": "Line chart comparing M/M/1 vs M/D/1 wait times.", "caption": "M/M/1 vs M/D/1 delay comparison."}, "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1883", "title": "Snapdragon Split Compute Margin", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the minimum network bandwidth required to match the local NPU speed of 40 TOPS if the operations represent 12 T-Ops total?", "visual": {"kind": "svg", "path": "mobile-1883.svg", "alt": "Diagram showing task split between local NPU and Cloud.", "caption": "Split computation latency paths."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1884", "title": "A17 LLM Context Cap", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the memory pressure by computing the peak KV cache size and determine if it exceeds a strict 1 GB application limit.", "visual": {"kind": "svg", "path": "mobile-1884.svg", "alt": "Bar chart comparing 1 GB cache size against the 1 GB limit.", "caption": "KV Cache capacity limit."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1885", 
"title": "Snapdragon Camera Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum throughput (fps) and end-to-end latency if only the ISP and NPU can run concurrently in the pipeline architecture layout?", "visual": {"kind": "svg", "path": "mobile-1885.svg", "alt": "Gantt chart showing parallel pipeline stages bottlenecked by 10ms NPU.", "caption": "Concurrent pipeline stages."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1886", "title": "A17 Background Queue Deep State", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an optimized scheduling limit to ensure the probability of having more than 3 tasks in the queue remains under 15%.", "visual": {"kind": "svg", "path": "mobile-1886.svg", "alt": "Line chart showing queue probabilities decaying exponentially.", "caption": "Probability of N tasks in the system."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1887", "title": "Snapdragon KV Memory Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum sequence length supported under this 500 MB footprint constraint?", "visual": {"kind": "svg", "path": "mobile-1887.svg", "alt": "Bar chart comparing token context counts and their memory consumption.", "caption": "Token limit given a 500MB constraint."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1888", "title": "FedAvg Wi-Fi Star vs Ring", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization time using a Star topology (one device acts as the parameter server) versus a Ring topology.", "visual": {"kind": "svg", "path": "mobile-1888.svg", "alt": "Star topology vs Ring topology comparison.", "caption": "Star topology for Federated Averaging."}, "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1889", "title": "Vision Framework Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply pipeline engineering to determine the latency to process the 10th frame under full saturation.", "visual": {"kind": "svg", "path": "mobile-1889.svg", "alt": "Horizontal bars representing CPU, GPU, and NPU stages overlapping.", "caption": "Video frame pipeline stages."}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1890", "title": "NPU-to-CPU Fallback Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply the M/M/1 infinite-queue tail approximation to estimate the fraction of arrivals routed to the CPU, and note where this approximation overestimates versus the exact M/M/1/5 blocking formula.", "visual": {"kind": "svg", "path": "mobile-1890.svg", "alt": "Line graph showing the 23.7% tail of the probability distribution above N=5.", "caption": "Probability of CPU Fallback."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1891", "title": "A17 OS Termination Grace", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Starting from the figure's 75 ms naive estimate, design the realistic checkpoint budget by adding (a) the CPU serialization tax across ~200 tensors, (b) UFS contention with the concurrent media-cache flush, and (c) the overlap between CPU serialization and storage I/O. Identify the binding stage, propose a parallelization strategy across the A17's performance cores, and confirm whether the 1.5 s grace window holds.", "visual": {"kind": "svg", "path": "mobile-1891.svg", "alt": "Gantt showing 75ms save fitting easily within 1500ms grace period.", "caption": "State Save vs Grace Period."}, "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 4}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1893", "title": "Mobile AR Frame Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does the average queuing delay of an M/M/1 model meet a strict 100ms SLA for AR frames?", "visual": {"kind": "svg", "path": "mobile-1893.svg", "alt": "A hockey-stick curve plotting average delay against arrival rate, showing exponential growth as utilization nears 100%.", "caption": "M/M/1 Queuing Delay"}, "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1895", "title": "A17 Pro NPU Keyword Spotting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the duty cycle percentage and compute the total active execution time the NPU spends over a full 1-minute period.", "visual": {"kind": "svg", "path": "mobile-1895.svg", "alt": "A timeline depicting brief 5ms high states followed by 45ms low states repeating regularly.", "caption": "NPU Duty Cycle (10%)"}, "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 
0}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1896", "title": "Snapdragon Cache Hit Effective BW", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why the bar-chart arithmetic-mean reading of 92 GB/s overstates the true effective bandwidth, and compute the correct time-weighted (harmonic-mean) effective bandwidth.", "visual": {"kind": "svg", "path": "mobile-1896.svg", "alt": "Side-by-side bar chart contrasting 100 GB/s L3 cache bandwidth with 60 GB/s main memory, with stacked bars showing the 80% / 20% hit/miss split that produces the 88.2 GB/s effective bandwidth.", "caption": "Cache vs Main Memory Bandwidth weighted by 80/20 hit ratio"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1899", "title": "Mobile NLP Pipeline Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the critical path bottleneck and calculate the absolute maximum throughput in tokens per second for this pipeline.", "visual": {"kind": "svg", "path": "mobile-1899.svg", "alt": "A bar chart of stage durations highlighting the 10ms NPU Transformer block as significantly taller than the rest.", "caption": "Pipeline Stage Durations"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1901", "title": "Snapdragon Always-On Energy", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply the duty cycle to calculate the total energy consumed strictly in Joules over exactly 1 hour of operation.", "visual": {"kind": "svg", "path": "mobile-1901.svg", "alt": "A duty cycle timeline showing power hovering at 2mW and spiking to 50mW for 10% of every period.", "caption": "Listening Model Energy Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1902", "title": "A17 Pro Off-Chip Spilling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an allocation strategy to minimize DRAM bandwidth waste, and compute the arithmetic intensity required to avoid bottling on RAM.", "visual": {"kind": "svg", "path": "mobile-1902.svg", "alt": "Bar chart comparing Apple A17 Pro SRAM bandwidth to LPDDR5 system memory bandwidth.", "caption": "Memory Hierarchy Bandwidth on Mobile SoC"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1903", "title": "Mobile AR Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": 
"both", "question": "Starting from the figure's sequential 30.3 FPS baseline, evaluate what steady-state FPS double-buffering would unlock given the non-overlappable 1 ms DMA penalty, identify the binding stage, and quantify the FPS uplift.", "visual": {"kind": "svg", "path": "mobile-1903.svg", "alt": "Gantt chart showing sequential CPU then NPU execution per frame.", "caption": "Sequential Processing Pipeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1904", "title": "TTS Chunk Queue Wait", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected wait time in the queue for a new text chunk using M/M/1 formulas?", "visual": {"kind": "svg", "path": "mobile-1904.svg", "alt": "Plot showing wait time rapidly increasing as arrival rate approaches service rate.", "caption": "Wait Time vs Arrival Rate"}, "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1905", "title": "5G Uplink Conversion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the bandwidth limit of the 5G connection in megabytes per second to determine if a 15 MB image can be offloaded in under 1 second.", "visual": {"kind": "svg", "path": "mobile-1905.svg", "alt": "Node diagram showing mobile device connected to cloud via a 100 Mbps link.", "caption": "Mobile Offload Link"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1907", "title": "Hexagon Flash Wear-out", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the tradeoff between checkpointing frequency and flash memory wear-out for this mobile application processing at 30 FPS?", "visual": {"kind": "svg", "path": "mobile-1907.svg", "alt": "Line graph showing cumulative GBs written over time growing extremely rapidly at 30 FPS.", "caption": "Flash Wear-Out Accumulation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1909", "title": "Mobile NPU Wake-Word Duty-Cycling Overhead", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the average power consumption including a 2ms wake-up transient overhead that draws 200mW.", "visual": {"kind": "svg", "path": "mobile-1909.svg", "alt": "Timeline showing active, sleep, and transient power spikes across multiple 20ms periods.", "caption": "Duty-Cycling Power Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-1910", "title": "Mobile NPU SRAM Spilling Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the added latency bottleneck per layer transition caused by spilling and reloading this 16MB tensor.", "visual": {"kind": "svg", "path": "mobile-1910.svg", "alt": "Horizontal bar chart showing the massive bandwidth difference between internal SRAM and external LPDDR5.", "caption": "Memory Tier Bandwidth"}, "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 2}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1911", "title": "Context Switching Mobile Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What architectural state must be saved to RAM to ensure the vision model can resume precisely without data corruption?", "visual": {"kind": "svg", "path": "mobile-1911.svg", "alt": "Timeline showing active NPU state, an interrupt spike, context save overhead, and context restore.", "caption": "Preemptive Context Switch"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1912", "title": "INT4 Quantization Bandwidth Impact", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "By what factor are memory bandwidth requirements reduced, and what is the secondary cache effect?", "visual": {"kind": "svg", "path": "mobile-1912.svg", "alt": "Bar chart comparing FP16 footprint spilling out of SRAM vs INT4 footprint fitting entirely inside.", "caption": "Cache Footprint Reduction"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1913", "title": "Federated Learning Disconnection Resilience", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain the state preservation strategy required to resume training immediately when connectivity is restored.", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1914", "title": "Zero-Copy Video Pipeline on Apple Silicon", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an asynchronous pipeline utilizing iOS unified memory primitives to prevent CPU copy bottlenecks.", "visual": {"kind": "svg", "path": "mobile-1914.svg", "alt": "Block diagram showing sequential pointer handoffs instead of data copies between ISP, GPU, and NPU.", "caption": "Zero-Copy IOSurface Pipeline"}, "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 4}, "chain_tiers": 
{"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1915", "title": "Tiered Sensor Hub Data Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a tiered duty-cycling system utilizing the always-on sensor hub to minimize main CPU wakeups.", "visual": {"kind": "svg", "path": "mobile-1915.svg", "alt": "Graph showing the sensor hub buffer filling slowly and dropping sharply when the main CPU wakes to drain it.", "caption": "Sensor Hub Buffer Fill and Drain"}, "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1916", "title": "Heterogeneous Data Exchange on Apple Silicon", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the primary memory architecture used to exchange multi-megabyte tensors between the NPU and GPU without copying.", "visual": {"kind": "svg", "path": "mobile-1916.svg", "alt": "Architecture diagram showing CPU, GPU, and NPU all pointing directly to a shared unified memory block.", "caption": "Apple Unified Memory Architecture"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1917", "title": "A17 Pro NPU Activation Tiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you create a memory allocation specification to pipeline UNet layer activations, avoiding spilling to slower system RAM when the peak intermediate tensor size is 120MB and the NPU SRAM is roughly 32MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1918", "title": "Snapdragon NPU Duty-Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the integrated energy-vs-latency trade-off of continuous vs batched duty-cycling policies?", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1919", "title": "A17 Pro LLM Quantization Protocol", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an evaluation protocol to determine if 4-bit block-wise quantization with FP16 activations provides a strictly better latency-accuracy tradeoff than pure 
INT8 quantization on the Neural Engine.", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 3}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1920", "title": "Snapdragon NPU/GPU Cache Thrashing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a diagnostic plan using performance counters to pinpoint L3 cache thrashing between the NPU and GPU, and propose a tiling fix.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1922", "title": "Android NLP Service Optimization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an optimized serving architecture using TFLite Native C++ API and Android's bound services to keep the model weights in memory and reduce cold start latency.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1923", "title": "Android OOM Gradient Resumption", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a fine-grained gradient checkpointing mechanism for local flash storage that minimizes write amplification while allowing resume from the exact mini-batch.", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 5}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1924", "title": "A17 Pro CPU-NPU Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Implement a producer-consumer queue that pipelines CPU pre-processing of frame N+1 with NPU inference of frame N, then derive the realized throughput from the per-stage timing.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1925", "title": "Hardware FIFO Sensor Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a hardware wake-up FIFO strategy to buffer accelerometer data, batching inference every 5 seconds instead of waking the CPU/NPU on every step.", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1926", "title": "CoreML 
INT8 Calibration", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply post-training static quantization using a calibration dataset to convert FP32 weights and activations to INT8, ensuring compatible operations for the ANE.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1927", "title": "NPU INT4 Unpacking Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does an INT4 LLM run slower than an INT8 LLM on an NPU despite requiring half the memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1929", "title": "A17 NPU Wake Break-Even for Activity Recognition", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the per-second energy accounting to compute the break-even rate for always-on vs rate-gating, and what is the best rate?", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1932", "title": "Calculate model memory reduction accounting for static KV cache", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the total memory savings factor when switching weights from FP16 to INT4, distinguishing the weight-only reduction from the system-level reduction once the static KV cache footprint is included.", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 1}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1933", "title": "Evaluate shared memory bandwidth contention during mobile task overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate how perfectly overlapping the NPU and GPU workloads will affect the AR frame latency.", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1935", "title": "Explain backpressure effects on queue arrival rates during mobile bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": 
"published", "phase": "both", "question": "Explain what happens to the arrival rate of the queue when the buffer fills up completely during a sync burst.", "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1936", "title": "Name tightly coupled memory used for local NPU accelerator caching", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the name of the ultra-fast memory situated closest to the NPU processing elements used to avoid fetching weights repeatedly from main LPDDR memory?", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 0}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1938", "title": "Symmetric INT8 Quantization on Apple A17 Pro", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply symmetric INT8 quantization to this weight tensor, calculating the scale factor and the quantized integer value for a weight of 1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1939", "title": "CoreML Execution Targets on Apple Silicon", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the three main hardware execution targets that the CoreML framework abstracts for dynamically routing model operations?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1941", "title": "Super-Resolution Compute Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the required compute for 1080p at 30 FPS, and does the NPU have capacity headroom to also support a 4K mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1942", "title": "PhotoKit in On-Device ML Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary function of the iOS PhotoKit framework when used as the data ingestion layer for an on-device ML pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-1943", "title": "Federated Learning Flash Memory Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why must saving model updates to flash memory during federated training be carefully balanced against storage wear out?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 1}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1944", "title": "Split-Computing Compression Overhead", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Diagnose why total end-to-end latency increases when compressing feature maps before 5G transmission, despite reducing payload size.", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1945", "title": "Diagnosing Low NPU Utilization", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the primary architectural reason why the model might achieve less than 10% of theoretical peak performance.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1946", "title": "Zero-Copy Video Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Identify the data movement bottleneck if frames are passed as UIImages, and explain how to eliminate it.", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 2}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1949", "title": "Cortex-X4 vs Hexagon TOPS/W on a 10 GOPS Workload", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Compute realized TOPS/W (GOPS/W) for each placement at the 10 GOPS demand point, identify which is more efficient, and explain the architectural source of the gap.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1950", "title": "On-Device LLM Bandwidth Bound", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the theoretical upper bound for auto-regressive token generation speed assuming it is entirely memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1952", "title": "Mobile KV-Cache Constraint", "topic": 
"kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the KV-cache memory footprint for FP16 and determine how many 4096-token sessions can fit within the 2GB allocation.", "visual": {"kind": "svg", "path": "mobile-1952.svg", "alt": "Stacked bar showing 3.5GB Weights and 2.0GB KV cache", "caption": "RAM Allocation Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1953", "title": "NPU Thermal Energy Bounds", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the thermal impact over a 1-hour session by computing total NPU energy consumed.", "visual": {"kind": "svg", "path": "mobile-1953.svg", "alt": "Square wave showing 5ms active peaks and 11.6ms idle valleys", "caption": "NPU Power Duty Cycle over 1 Frame"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1954", "title": "Voice Translation Queue Spike", "topic": "queueing-theory", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the root cause of massive latency lag using queueing theory, computing expected queue length in steady state.", "visual": {"kind": "svg", "path": "mobile-1954.svg", "alt": "Line graph showing queue length shooting up at 0.95 utilization", "caption": "Queue Length vs Utilization"}, "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1956", "title": "Agentic LLM Prefix Cache", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you optimize the KV-cache memory bandwidth usage across multiple consecutive turns?", "visual": {"kind": "svg", "path": "mobile-1956.svg", "alt": "Bar chart showing large shared prefix block and small individual generation blocks", "caption": "Prefix Caching Memory Layout"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1957", "title": "On-Device Async Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you implement an asynchronous checkpointing mechanism that bounds data loss to 10 seconds without slowing the training loop?", "visual": {"kind": "svg", "path": "mobile-1957.svg", "alt": "Timeline showing compute phase overlapping with flash write phase in a separate thread", "caption": "Asynchronous Thread Checkpointing"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-1958", "title": "INT8 Context Window Doubling", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how KV-cache quantization (FP16 to INT8) impacts memory limits and maximum sequence length.", "visual": {"kind": "svg", "path": "mobile-1958.svg", "alt": "Bar chart comparing 3GB total RAM for FP16 vs 2GB total RAM for INT8", "caption": "RAM Footprint: FP16 vs INT8 KV Cache"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1959", "title": "P2P Bluetooth Sync", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does it take to exchange the 1MB embedding using point-to-point Bluetooth?", "visual": {"kind": "svg", "path": "mobile-1959.svg", "alt": "Two nodes connected by a bidirectional arrow labeled 2 Mbps", "caption": "Point-to-Point Bluetooth Exchange"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1961", "title": "Random Read Memory Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the effective memory bandwidth given the cache miss penalty?", "visual": {"kind": "svg", "path": "mobile-1961.svg", "alt": "Bar chart showing L2 BW, DRAM BW, and the resulting Effective BW being near DRAM", "caption": "Effective Bandwidth Degradation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1962", "title": "Federated Tree Aggregation", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you devise a hierarchical aggregation topology to handle the massive uplink traffic and reduce central server bottlenecking?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1964", "title": "On-Device Checkpoint RPO/RTO", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a checkpointing schedule to maintain a 5-minute RPO and a 10-second RTO.", "visual": {"kind": "svg", "path": "mobile-1964.svg", "alt": "Timeline displaying training intervals, a 5-minute RPO window, a failure event, and a 10-second RTO recovery.", "caption": "RPO and RTO timeline for on-device training."}, "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 3}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1965", "title": "Mobile 7B LLM Memory Tiers", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", 
"zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify which memory tier acts as the primary bandwidth bottleneck during single-batch token generation.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1966", "title": "Mobile KV Cache Quantization", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Does INT8 KV cache quantization improve or regress net per-token decode latency on Hexagon given the dequantization compute overhead?", "visual": {"kind": "svg", "path": "mobile-1966.svg", "alt": "Bar chart showing INT8 accommodating double the context length of FP16 under a 500MB budget.", "caption": "Context length capacity under a 500MB KV Cache budget."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1967", "title": "Voice Assistant Bursty Queue", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do tail latency spikes occur under bursty arrivals compared to uniform arrivals despite the same average utilization?", "visual": {"kind": "svg", "path": "mobile-1967.svg", "alt": "Two queueing curves: bursty traffic shows latency spiking at much lower utilization than uniform traffic.", "caption": "Impact of arrival burstiness on wait time."}, "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1972", "title": "A17 Pro Ring AllReduce Bounds", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical lower bound communication volume sent per device for a single AllReduce pass over a 10MB payload.", "visual": {"kind": "svg", "path": "mobile-1972.svg", "alt": "Four nodes connected in a unidirectional ring.", "caption": "Ring topology for decentralized mobile communication."}, "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1977", "title": "Mesh AllGather vs Central Server", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the architectural advantage of an AllGather operation in this mesh compared to a central parameter server?", "visual": {"kind": "svg", "path": "mobile-1977.svg", "alt": "A fully connected mesh of nodes showing symmetrical links.", "caption": "Symmetrical Mesh Topology for Distributed AllGather."}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1978", "title": "Hexagon NPU TCM Spilling Impact", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the performance impact of missing the NPU's TCM and spilling memory traffic to main memory.", "visual": {"kind": "svg", "path": "mobile-1978.svg", "alt": "Bar chart displaying a massive latency spike when missing TCM and hitting LPDDR5x.", "caption": "Access Latency: TCM vs LPDDR5x Spilling."}, "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 3}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1979", "title": "A17 Pro ISP to NPU Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Determine the maximum achievable frame rate if the ISP and NPU execute sequentially without pipelining.", "visual": {"kind": "svg", "path": "mobile-1979.svg", "alt": "Throughput bar chart showing 2ms ISP write followed by 3ms NPU read.", "caption": "Sequential data pipeline latency stages."}, "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 0}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1980", "title": "Mobile Federated Sync Overhead", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the time required to sync the model update to the cloud?", "visual": {"kind": "svg", "path": "mobile-1980.svg", "alt": "Diagram showing mobile device transmitting data to cloud server.", "caption": "Mobile to Cloud uplink bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1981", "title": "Snapdragon Zero-Copy Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the copy latency and overall execution time without zero-copy optimization for a 16 MB image?", "visual": {"kind": "svg", "path": "mobile-1981.svg", "alt": "Bar chart showing a small 0.2ms copy overhead before a 2ms inference.", "caption": "Memory copy overhead in the data pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1982", "title": "A17 Pro AR Burst Processing", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Starting from the figure's 300 ms naive estimate, specify the realistic worst-case time for the queue to return to its 
pre-burst depth, including the cold-cache tax, worst-case variance, and the open-loop arrival stream during drainage.", "visual": {"kind": "svg", "path": "mobile-1982.svg", "alt": "Queue length graph starting at 15 and linearly draining to 0 over 300ms.", "caption": "AR Burst queue drain over time."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1983", "title": "Mobile FL Tree Aggregation", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the total data received by the root node in a balanced binary tree, compared to a flat star topology.", "visual": {"kind": "svg", "path": "mobile-1983.svg", "alt": "Tree structure showing root node with two children, cascading down.", "caption": "Binary tree aggregation reducing root bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1985", "title": "A17 Pro Unified Memory Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If each token in the KV cache requires 1 MB of memory, what is the maximum context length supported?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1986", "title": "NPU TCM Memory Tiling", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify a memory tiling and scheduling strategy to process this layer locally without spilling intermediate activations to main mobile DRAM.", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 4}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1989", "title": "Zero-Copy Image Pipeline", "topic": "data-pipeline-engineering", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an evaluation metric for the memory pipeline between the ISP, GPU, and NPU, and propose a unified zero-copy buffer architecture.", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 4}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1990", "title": "Tiered Sensor Gating", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a multi-tiered duty-cycling strategy to extend battery life using the 
ambient light sensor and ISP motion detection.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1991", "title": "QAT for Text-to-Speech", "topic": "quantization-fundamentals", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an optimization plan to recover acoustic fidelity without dropping back to FP16, addressing the dynamic range issues.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1993", "title": "Voice Translation Pipelining", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you implement an execution pipeline to process overlapping 40ms audio chunks concurrently, keeping per-chunk latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1995", "title": "ARKit Priority Queuing Tail Latency", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the tail latency for Hand Tracking under (a) FIFO vs (b) priority-with-preemption queueing, including the OS preemption cost and the post-12-minute thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1996", "title": "Diffusion Model Memory Bound Check", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate total MACs per step and implement an arithmetic intensity check to prove whether generation is compute-bound or memory-bound.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1997", "title": "Mobile NPU versus GPU Memory Bandwidth Tradeoffs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the A17 Pro Neural Engine or the GPU handle KV cache updates for a 1B parameter INT8 model to maximize throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1998", "title": "Mobile Neural Processing Unit Adaptive Batching Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Model the processing pipeline as an M/D/1 queue and formulate an adaptive batching strategy to keep 99th percentile latency under 500ms when user speech rate unexpectedly doubles.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-1999", "title": "SRAM Tiling versus DRAM Fetch on Mobile Chips", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy and latency trade-offs of tiling the feature maps to fit in SRAM versus relying on unified memory DRAM spills for a 1080p frame.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2002", "title": "Zero-Copy Memory Sharing Between Mobile CPU and NPU", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the overhead of context switching and memory movement between the CPU and NPU, and propose a shared-memory strategy using Android NNAPI.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2005", "title": "Thermally Constrained Background Data Pipelines on Mobile Cores", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate a data pipeline that tokenizes SQLite text messages incrementally without causing thermal throttling or waking up the high-performance CPU cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2007", "title": "Mobile FL Checkpoint Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the trade-off between checkpointing frequency and flash storage wear-out for this interrupted mobile training job.", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 4}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2013", "title": "Mobile I/O Network Throttling", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does network degradation cause the local Neural Engine inference to stagger and drop frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2014", "title": "iOS Memory Pressure Kills", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does synchronous checkpointing of the 100MB model state during the system's memory-warning interrupt lead to corrupted or lost progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2016", "title": "Snapdragon Checkpoint Resume", "topic": 
"fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the resume-time overhead of JSON versus memory-mapped binary checkpoint formats. Identify the binding cost and quantify the resume-time difference.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2017", "title": "NPU Batching Jitter", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does enforcing a fixed batch size constraint on the NPU queue exacerbate tail latency (jitter) for streaming real-time audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2018", "title": "LIFO Voice Assistant Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Design a queueing discipline that prioritizes perceived latency upon recovery, and explain its impact on the user experience.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2019", "title": "SRAM Tiling for 4K Video Upscaling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate the caching strategy and calculate the minimum SRAM required to hold one horizontal tile strip allowing for a 3x3 receptive field overlap.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2020", "title": "Mobile NPU Polling Power Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the power inefficiency and calculate the average power consumption of this NPU polling pattern.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2021", "title": "Mobile AR FP16 Memory Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the required memory bandwidth for the model weights alone and calculate what percentage of total system bandwidth this consumes.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2022", "title": "Zero-Copy Video Pipeline Bandwidth Saving", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a zero-copy pipeline strategy and calculate the exact memory bandwidth saved per second compared to a deep-copy moving frames into an 
NPU-specific buffer.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2025", "title": "A17 SLC-Resident Mixed-Precision Optimization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the mixed-precision allocation that fits the 26 MB SLC budget, honors per-layer precision floors, and maximizes throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2026", "title": "NPU Translation Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the realistic sustained throughput in sentences per second assuming the empirical 65% NPU utilization, and contrast against the 100%-utilization theoretical upper bound.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2027", "title": "Audio Buffer Downsampling Accumulation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Implement the structural downsampling logic and calculate exactly how many 1024-sample CoreAudio buffers must be accumulated to run one ML inference.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2028", "title": "Facial Mesh Target FLOPS Budget", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the required compute budget in GFLOPs/s under both nominal INT8 operation and the mixed-precision fallback mode. 
Determine whether the NPU still hits 120 FPS in fallback when GPU composition is active.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2029", "title": "Identifying Bandwidth Bottlenecks in Mobile Video Processing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Identify the likely bottleneck in the data pipeline and calculate the required memory bandwidth to sustain 60 FPS for uncompressed 4K (3840x2160, 3 channels, 8-bit) input.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2030", "title": "Designing Hierarchical Wake-Up Pipelines for Mobile NPU", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycled pipeline using a low-power CPU threshold to trigger the Neural Engine, quantifying the expected power savings.", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2031", "title": "Evaluating Memory Bandwidth in Mobile INT8 Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the per-token decode latency under FP16 vs INT8 weights, accounting for a 50 GB/s shared LPDDR5 bus. 
Identify whether INT8 fully recovers the 2x throughput it provides on a contention-free bus.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2032", "title": "Analyzing Mobile Thermal Throttling Using D/D/1 Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate how the system queue length and frame-dropping behavior change during a 1-second thermal throttling burst.", "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2033", "title": "Understanding Android NNAPI Initialization Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the NNAPI initialization steps that run on the main thread, identify which step causes the ANR, and propose the threading fix to ensure the UI remains responsive.", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2034", "title": "Analyzing Thermal and Power Impacts of Mobile Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does duty-cycling 100 images over 2 seconds impact the thermal envelope compared to continuously processing 1.6 images per second?", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2035", "title": "Understanding Zero-Copy Unified Memory on Apple Silicon", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does zero-copy unified memory avoid CPU<->ANE copies, and what alignment conditions cause silent fallbacks to memcpy?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 0}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2036", "title": "Evaluating JIT Compilation Latency Spikes on Mobile NPU", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the primary cause of this first-run latency spike and propose an 
architectural solution.", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 4}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2041", "title": "Audio Pipelining to Reduce First-Word Latency", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you apply a temporal pipelining technique to significantly reduce the user-perceived latency of the first translated word?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2043", "title": "Doubling Battery Life via VAD Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the continuous runtime, and what is the required duty cycle for a 1mW VAD to double the battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2050", "title": "On-Device Personalization Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you create an execution architecture that performs this update without violating iOS battery and thermal background constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2053", "title": "Mobile LLM Bandwidth Bound", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum model size (in billions of parameters) to ensure a guaranteed generation speed of 15 tokens per second?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2054", "title": "Evaluate Preprocessing Offload for Thermal Headroom", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the trade-offs of migrating preprocessing to the ISP's scaling blocks compare to reducing the initial camera sensor resolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2056", "title": "Pipeline Design for Streaming Multimodal Outputs", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a pipelined software architecture that hides the Bluetooth transmission latency and ensures seamless UI text streaming.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-2057", "title": "Memory Bandwidth Bound for Mobile Autoregression", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum memory bandwidth required to read the weights during generation, and can an Apple A17 Pro support it?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2060", "title": "Hierarchical Wake-ups Using Mobile Coprocessors", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how cascading inference through the low-power motion coprocessor can heavily gate the NE and extend battery life.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2061", "title": "Analyze WakeLock Interaction with Mobile OS Sleep", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how Android's Doze mode reacts to exact repeating WakeLocks, and how aligning inferences to OS maintenance windows prevents this drain.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2063", "title": "Design Strict Memory Bound LoRA Fine-Tuning", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What memory flow architecture permits gradient calculation for LoRA adapters without loading 4GB base weights into mutable app memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2064", "title": "Bursty AR Frame Pipeline Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does this bursty arrival pattern impact the 99th percentile frame processing latency compared to a perfectly uniform arrival rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2065", "title": "NPU Memory Tiling for Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory tiling architecture that partitions the query, key, and value matrices into the NPU's localized SRAM to minimize LPDDR5 fetch requests.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2066", "title": "Hexagon NPU Compute Utilization", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the NPU 
compute utilization percentage if the pipeline processes a fixed 60 frames per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2067", "title": "Hexagon Audio Batching Energy", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy consumed per minute by batching 5 snippets and waking up every 5 seconds, versus waking up instantly for every snippet.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2068", "title": "Image Resizing Memory Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the absolute minimum system memory read and write bandwidth consumed purely by this resizing step for a single frame.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2069", "title": "Federated Checkpoint Thermal Impact", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do frequent heavy writes to non-volatile storage affect the System on Chip (SoC) thermal budget and background task execution?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 3}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2070", "title": "Neural Engine and GPU Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total end-to-end latency for one frame versus the steady-state pipeline throughput if the ANE and GPU execute asynchronously?", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2071", "title": "Pedometer Power Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption of this step-detection cycle.", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2074", "title": "Estimating Maximum Frame Rate on A17 Pro Neural Engine", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "Assuming 40% utilization of the peak TOPS, what is the maximum frame rate the NPU can sustain?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2075", "title": "CPU versus NPU Image Format Conversion Performance", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain the performance impact of performing the YUV-to-RGB conversion and planar transposition on the CPU versus an image signal processor (ISP) or NPU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2076", "title": "Optimal Checkpointing Frequency for Mobile Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Design a local checkpointing frequency that limits lost training progress to at most 20 seconds, given saving the model takes 50ms of flash write time.", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 0}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Every 1 second", "Every 20 seconds", "Every 200 seconds", "Only when the user manually saves"], "correct_index": 1}}, {"id": "mobile-2077", "title": "Unified Memory Contention Between NPU and GPU Execution", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why aggressively pipelining the NPU inference and GPU rendering might cause frame drops despite sufficient theoretical compute capacity for both.", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2078", "title": "Pipelining Model Weights Loading in Mobile Voice Translation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how to deploy both models to minimize the 300ms total loading latency during a real-time conversation.", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2079", "title": "LLM KV Cache Impact on Unified Memory System Cache", "topic": "memory-hierarchy-design", 
"competency_area": "memory", "track": "mobile", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does the unified memory architecture impact the system level cache (SLC) hit rate and overall memory power during a long sequence generation?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 0}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2081", "title": "Pipelining Autoregressive Token Generation on Snapdragon NPU", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum token generation rate if weight loading and compute are perfectly pipelined?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2083", "title": "Wake-up Energy Penalty in NPU Micro-power Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the energy consumed to process a 10 ms batch of data from sleep, through wake-up, to completion.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2084", "title": "Overlapping Compute and Network on Snapdragon", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How can the mobile client overlap the transmission of layer N's activations with the computation of layer N+1 or subsequent frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2087", "title": "Hardware Image Scaling for Mobile NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you optimize the data pipeline to downsample the image without burdening the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2088", "title": "Compute Latency Under Thermal Burst Limits", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the expected inference time and thermal implications if the NPU can sustain 70% utilization before throttling after 2 seconds.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2089", "title": "Evaluating Pipeline Sync Barriers on Snapdragons", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": 
"What is the pipeline's overall throughput and bottleneck if a strict memory synchronization barrier is added between every frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2090", "title": "Designing Continuous Batching for Shared NPUs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving scheduler that maximizes NPU utilization and minimizes context switching overhead for heterogeneous batch sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2091", "title": "Calculating LLM Token Rates by Memory Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical token generation rate for batch size 1 under shared memory constraints, assuming 80 GB/s total bandwidth and 20 GB/s for display?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 3}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2092", "title": "Applying Accelerators for Zero-Copy Image Resizing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can the iOS image signal processor be used to bypass the CPU for hardware-accelerated 4K to 224x224 image resizing?", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 3}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2094", "title": "Analyzing Latency of Cold-Swapping NPU SRAM", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the latency implications of cold-swapping these models into the NPU's SRAM versus co-residing them in main memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2096", "title": "Applying iOS Jetsam Footprint Limits to LLMs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do memory management principles explain what happens when the app footprint approaches system limits?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 1}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2097", "title": "Multimodal Pipeline 
Interleaving", "topic": "data-pipeline-engineering", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an optimized zero-copy data pipeline architecture that interleaves visual and audio ML workloads without hitting memory bandwidth bottlenecks.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2098", "title": "Transformer INT8 Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the activation distributions to pinpoint the root cause of the accuracy degradation and propose a hardware-compatible mitigation.", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 2}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2099", "title": "AR Throttling Queue Control", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a dynamic queue management algorithm to maintain real-time UI responsiveness and prevent queue explosion during thermal throttling events.", "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2100", "title": "4K Image Memory Tiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify the memory allocation sequence and tile sizes to process the 4K image without thrashing main memory.", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 1}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2103", "title": "Uncompressed Frame Bandwidth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the memory bandwidth required just to transport the converted RGB frames to the NPU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2105", "title": "NPU Audio Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If the NPU wakes up for 200ms every 1 second, calculate the average power consumption.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2106", "title": 
"Symmetric vs Asymmetric MACs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the impact of asymmetric vs symmetric quantization on the latency of the model's depthwise convolutional layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2108", "title": "A17 Super-Resolution Compute", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical minimum execution time for a single image, isolating purely computational limits.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2109", "title": "Federated Learning OOM Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between checkpointing frequency and OOM recovery overhead to minimize total wasted training time.", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 2}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2111", "title": "Always-On Gaze Detection", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the maximum supported frame rate to keep average NPU power strictly under 5mW.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2119", "title": "PagedAttention Fragmentation on Mobile", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the FP16 KV cache size per page, and what is the memory fragmentation overhead during a typical 15-token generation?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], "chain_positions": {"mobile-chain-bucket-kvcachem-05": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2121", "title": "Hexagon-Adreno Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the pipeline bubble and end-to-end latency if the NPU stage takes 12ms, the GPU takes 8ms, and the synchronization barrier over shared memory takes 2ms?", "chain_ids": ["mobile-chain-auto-secondary-017-47"], "chain_positions": {"mobile-chain-auto-secondary-017-47": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-47": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2123", "title": "Wi-Fi Direct MAC Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the pipeline latency per token, factoring in a 4ms fixed MAC contention overhead per transmission?", "chain_ids": ["mobile-chain-auto-secondary-017-48"], "chain_positions": {"mobile-chain-auto-secondary-017-48": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-48": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2125", "title": "Wearable SPI DMA Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Determine the pipeline efficiency if the SPI DMA setup takes an additional 1ms synchronization barrier per frame.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2126", "title": "Video SR Memory Barrier", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the first-frame latency penalty caused by a 5ms synchronization barrier?", "chain_ids": ["mobile-chain-auto-secondary-017-47"], "chain_positions": {"mobile-chain-auto-secondary-017-47": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-47": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2127", "title": "Multi-Device AR Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a scheduling strategy to minimize the pipeline bubble given the strict network constraints.", "chain_ids": ["mobile-chain-auto-secondary-017-48"], "chain_positions": {"mobile-chain-auto-secondary-017-48": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-48": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2128", "title": "Drone Mesh Pipelining", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a micro-batch pipeline schedule that masks the 30ms mesh multi-hop synchronization delay, ensuring GPU utilization remains >80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2130", "title": "AirDrop Half-Duplex Sync", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total time required for the AllGather, incorporating a 20ms session negotiation latency and Wi-Fi 
Direct's half-duplex constraint.", "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2131", "title": "Automotive Ethernet AllGather", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the ring AllGather latency for an 8MB tensor per node across 3 nodes over Gigabit Ethernet, accounting for a 50us switch processing delay per hop and a 1ms synchronization barrier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2132", "title": "5G Hierarchical Tree FL", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective bandwidth and total tree-reduce time per round, factoring in a 500ms synchronization wait for the 95th percentile straggler.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2133", "title": "LoRa Swarm Gossip", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the expected convergence time for a single round, given a 200ms CSMA/CA backoff penalty per transmission and a 3-hop network diameter.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2134", "title": "MoE AllToAll Imbalance", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the AllToAll latency for a 5MB batch of tokens, given an uneven expert load (75% tokens to node A, 25% to node B) and a 0.5ms TCP setup barrier.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2136", "title": "BLE Gossip Quantization", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an asynchronous quantization scheme that prevents the 30ms BLE connection interval from bottlenecking the 2Hz update requirement.", "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2137", "title": "A17 Paged Cache Fragmentation", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "What is the maximum sequence length supported before spilling to flash, given 128KB KV per token and a 1ms memory controller synchronization per page allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2138", "title": "NPU DMA Ring Buffer KV", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory bandwidth and DMA overhead are consumed by reading a 128MB KV cache for a 1024-token context at 10 tok/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2139", "title": "Orin Cache Line Fetch", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Design a block sizing strategy to minimize the 5us synchronization overhead per block fetch while keeping memory fragmentation under 10% for a 500-token chat.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2140", "title": "Wi-Fi 6 MAC Bubble Inference", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the network stall time induced by sending a 2MB KV cache update per generation step, accounting for Wi-Fi 6's 3ms MAC layer synchronization bubble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2141", "title": "Wearable BLE KV Offload", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the maximum allowable context window before the 15ms BLE connection synchronization latency and transfer time exceed the 100ms real-time audio budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2142", "title": "A17 Sliding Window Sync", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective memory bandwidth utilization if rolling the 128MB KV cache requires a 500us synchronization barrier between the CPU and NPU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2143", "title": "Orin RDMA Cache Lock", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can a distributed KV cache allocation protocol mitigate the 10us RDMA lock synchronization cost when multiple Orins append to the same sequence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-2144", "title": "5G Remote KV Prefetching", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What speculative prefetching mechanism for the remote KV blocks can hide the 15ms 5G synchronization latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2145", "title": "Mobile mmap: Loading Models from Flash Storage", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 2GB INT8 mmap model take 4s on first inference from UFS 3.1, and how should it be warmed?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": {"mobile-chain-auto-secondary-014-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2147", "title": "Model Format Conversion: Sizing the FP16 CoreML Payload", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does the FP16 conversion mathematically impact the model's storage footprint, and what is the expected payload size of the resulting CoreML model?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 1}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2151", "title": "KV-Cache: Diagnose Latency Spikes in Dynamic Paged Allocation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do these severe latency spikes and OOM kills occur at allocation boundaries despite available memory, and what mobile OS dynamic drives this behavior?", "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2152", "title": "Android SoC NPU KV Cache Size Estimation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the maximum number of tokens the KV cache can hold before exhausting the remaining shared memory?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2154", "title": "The Infotainment Traffic Jam", "topic": "model-serving-infrastructure", "competency_area": 
"deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the architectural root cause of these unpredictable latency spikes, and what system-level mitigations would you deploy to guarantee the SLA?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2157", "title": "The Depthwise Cache Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Structurally, why does a depthwise convolution eliminate the L1 cache reuse present in a standard convolution, causing its arithmetic intensity to plummet into the memory-bound region of the Roofline model?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2161", "title": "Race-to-Sleep vs. Paced Execution for Mobile LLMs", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which power management strategy yields a lower overall thermal load for this continuous generative workload, and how do Dynamic Voltage and Frequency Scaling (DVFS) principles justify your choice?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0001", "title": "The Memory Collision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the SRAM layout, how is your 'stable' tensor arena being corrupted?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0002", "title": "The RF Energy Sink", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the RF breakdown, why is BLE consuming so much more than a simple packet transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0003", "title": "The Quantization Blur", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": 
"Based on quantization mechanics, why does an INT8 conversion completely destroy the model's discriminative boundaries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0004", "title": "The CPU Cycle Thief", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which 100 kHz I2C ISR workload steals enough cycles to break a 40 ms KWS model with a 50 ms deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 3}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0005", "title": "The Arena Swiss Cheese", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the arena map, why can't the system satisfy a 30 KB request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0006", "title": "The Jitter Storm", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can an 8 ms BLE event make a 23 ms gesture task miss a 25.6 ms hard deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0007", "title": "The Ghost Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Based on the clock diagram, why is a single crystal failing to keep them aligned?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0008", "title": "The Observation Gap", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a forest device listening 5 seconds per minute miss over 90% of random 1-second bird calls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0009", "title": "The Memory-Mapped Weight Corruption", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the memory map, how did a data bug destroy your model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0010", "title": "The SPI Bus 
Latency Choke", "topic": "extreme-quantization", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does XIP from external serial flash make a 240 MHz CPU run inference 10x slower than simulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0021", "title": "The OTA Flash Memory Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much flash must be reserved specifically for the OTA update partition?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 KB, for storing a small patch file.", "Effectively 0 KB, as you can overwrite the existing application in-place.", "~500 KB, to hold a complete second copy of the application binary.", "~32 KB, the space taken by the bootloader itself."], "correct_index": 2}}, {"id": "tinyml-0023", "title": "The Duty Cycle Power Drain", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What average power does a Cortex-M4 wake-word sensor draw when active 1 s and asleep 9 s per cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10 mW", "~5 mW", "~1 mW", "~10 µW"], "correct_index": 2}}, {"id": "tinyml-0027", "title": "The Deep Sleep Power Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of power consumption for a typical microcontroller in an active state versus a deep sleep state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100×", "~1,000×", "~10,000×", "~1,000,000×"], "correct_index": 2}}, {"id": "tinyml-0028", "title": "The Remote Wildlife Camera's Lifespan", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Given the Cortex-M4's active power consumption is 50mW and its deep sleep power is 10µW, approximately how long will the battery last?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 1}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~12 days", "~60 days", "~1,800 days", "~5,500 days"], "correct_index": 2}}, {"id": "tinyml-0037", "title": "The Wildlife Camera's Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": 
"inference", "question": "How would you calculate the operational lifetime of the device and what is that lifetime?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 1.2 days", "About 20 days", "About 73 days", "About 2.5 days"], "correct_index": 2}}, {"id": "tinyml-0038", "title": "The Keyword Spotter's Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power and battery life result from checking 0.5-second audio clips every 5 seconds on a 720 mWh battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~18 hours", "~36 hours", "~7.5 days", "~2.5 days"], "correct_index": 2}}, {"id": "tinyml-0040", "title": "The Birdwatcher's Power Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power comes from 1 s active at 10 mW and 59 s sleep at 10 uW on a Cortex-M4 bird sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0 mW", "0.59 mW", "0.177 mW", "0.010 mW"], "correct_index": 2}}, {"id": "tinyml-0044", "title": "The Battery-Powered Birdwatcher", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many days will an 888 mWh battery last when a Cortex-M4 wildlife camera wakes 6 times per hour for 1 s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~18 hours", "~4 days", "~397 days", "~444 days"], "correct_index": 2}}, {"id": "tinyml-0046", "title": "The Energy-Neutral Wildlife Camera", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum active time per 60-second cycle for a 1 mW solar-powered camera to remain energy-neutral?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 0}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.20 seconds", "~58.81 seconds", "~1.19 seconds", "0.012 seconds"], "correct_index": 2}}, {"id": "tinyml-0047", "title": "The Wildlife Camera's Power Budget: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you compute average power for a wildlife camera from its active and deep-sleep states?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.67 mW", "~9.7 mW", "~1.34 mW", "~25 mW"], "correct_index": 2}}, {"id": "tinyml-0048", "title": "The Wildlife Sensor's Power Budget", 
"topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the wildlife sensor draw with 2 s active at 40 mW and 18 s sleep at 10 µW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~13.0 mW", "~4.01 mW", "~4.90 mW", "~4.00 mW"], "correct_index": 1}}, {"id": "tinyml-0050", "title": "The Duty Cycle Constraint", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the correct term for the percentage of time the CPU is in an active state processing audio versus in a low-power sleep state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Latency", "Throughput", "Duty Cycle", "Power Draw"], "correct_index": 2}}, {"id": "tinyml-0052", "title": "The Bird-Call Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the bird-call sensor draw with 2 s at 50 mW and 58 s at 10 µW each minute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.67 mW", "~11.33 mW", "~1.68 mW", "~100.6 mW"], "correct_index": 2}}, {"id": "tinyml-0053", "title": "The FOTA Update Risk", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should 10,000 nodes receive a 200 KB model over 250 B/s LoRaWAN without bricking on failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0054", "title": "Model Versioning on MCU", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a fleet of RP2040 sensors expose ML model versions remotely without an underlying OS or filesystem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0056", "title": "BLE Throughput for Model Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long and how much battery does a 150 KB BLE 5.0 model update cost on a 100 mAh wearable?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0058", "title": "The Offline Drift Detector", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": 
"L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you detect and handle model drift on a device with 256 KB SRAM and no internet?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0059", "title": "FOTA Update Integrity Verification", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is verifying the binary hash insufficient for ML models, and how do you implement functional model attestation (inference on a golden test input) to prove the model's math is intact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0061", "title": "Inference Result Compression for Upload", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compress daily RP2040 bird-classifier uploads to cut a 1,000-device cellular bill from about $13,000/month?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0062", "title": "Bootloader A/B Firmware Partitioning", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the flash memory layout for a 1 MB flash footprint to support A/B firmware partitioning with rollback?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0063", "title": "Fleet-Wide Model Update Strategy", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should 100,000 sensors across 5 MCU variants and BLE, LoRaWAN, and LTE-M receive a retrained model update?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0064", "title": "Hardware-in-the-Loop Testing", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many HIL boards test 5 Cortex-M variants in CI, and what wall time does each commit need?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0068", "title": "The OTA Update Brickening", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did a 20 KB tensor-arena increase brick 10% of OTA-updated sensors despite sufficient total free memory and dual partitions?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0071", "title": "The MCU Model Extraction Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you architect a defense-in-depth security strategy to protect the model on a constrained $3 MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0072", "title": "The Flash Extraction Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can power side-channel analysis extract model weights by correlating power traces with MAC operations, and why does the model's arithmetic structure make this ML-specific attack possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0073", "title": "Secure Boot Chain for ML Models", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an STM32U5 TrustZone wearable authenticate both firmware and cardiac model weights at boot?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 2}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0076", "title": "The Continuous Logging Flash Death", "topic": "vram-budgeting", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How did logging 20 bytes every 5 minutes destroy the Flash?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0079", "title": "The Audio Buffer Memory Footprint", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "If you're sampling at a standard 16 kHz with a 16-bit depth, what is the approximate size of the raw audio buffer you need to allocate in SRAM just to hold one clip for processing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["3.2 KB", "320 bytes", "32 KB", "320 KB"], "correct_index": 2}}, {"id": "tinyml-0080", "title": "The Sensor Data Ingestion Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much SRAM is required to buffer exactly 1 second of 16 kHz, 16-bit mono audio for TinyML wake-word inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 KB", "64 KB", "32 KB", "256 KB"], "correct_index": 2}}, {"id": "tinyml-0081", "title": "The Keyword Spotting Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you budget a 500ms keyword-spotting response deadline across audio capture, feature extraction, inference, and action?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0082", "title": "The BLE Disconnect During OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a 200 KB ML model size determine BLE OTA transfer time, and why is it riskier than firmware without incremental checksums?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0084", "title": "Watchdog Timers and Hard Real-Time Guarantees", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why must a watchdog timeout be based on worst-case ML execution time rather than average inference latency?", "chain_ids": ["tinyml-chain-auto-secondary-008-06"], "chain_positions": {"tinyml-chain-auto-secondary-008-06": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0086", "title": "The DMA Ping-Pong Desync", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a 1024-sample I2S ping-pong buffer corrupt audio when inference takes 65 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0087", "title": "The FPU Register Thrashing", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does converting ADC values to float inside a 1 kHz Cortex-M4F ISR spike RTOS latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0088", "title": "The Sensor Pipeline Without 
Drops", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 1 kHz vibration pipeline process 256-sample windows with 50 ms inference without dropping samples?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0092", "title": "The I2C Bus Lockup", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does a long ML inference blocking the I2C ISR cause the sensor to clock-stretch indefinitely, and how does the model's layer execution time determine the maximum safe I2C timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0094", "title": "The Power Supply ADC Jitter", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the ADC introduce broadband noise above 2 kHz, and how does it drop the F1-score to 0.71?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0097", "title": "Side-Channel Attacks on MCU Inference", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can an attacker extract a Cortex-M4 model utilizing Differential Power Analysis (DPA) despite RDP Level 2 flash protection?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 1}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0098", "title": "Power Profiling for MCU Inference", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What did you miss in your datasheet-based estimate, and how do you build an accurate power profile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0099", "title": "Power Profiling Methodology", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a measurement setup to get the real power profile, and what did the team likely miss?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0100", "title": "Watchdog Timer Integration with Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": 
"L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 85 ms nRF5340 inference randomly trip a 100 ms watchdog, and where should the watchdog be kicked?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 2}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0105", "title": "Vibration-Based Predictive Maintenance", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a vibration-based predictive maintenance system for 500 industrial motors using Cortex-M4 sensors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0107", "title": "Sensor Aging Changes the Baseline — Detecting and Adapting On-Device", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you detect this is sensor drift (not real anomalies) and adapt on-device without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0108", "title": "The Hardware Crypto Engine Latency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an hourly TLS handshake freeze the CPU and cause a Cortex-M4 with hardware AES to drop 20 ms audio frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0110", "title": "The Heterogeneous MCU Scheduling Problem", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which i.MX RT1170 core should run the 2M-MAC vibration classifier and the 180 KB-state temperature LSTM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0111", "title": "The Interrupt Latency Impact", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you calculate the effective inference time under interrupt load, and at what sensor sampling rate does the system miss the 30ms inference deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 4}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0112", "title": "The Power Supply Noise Impact", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do 50 mV regulator ripple and 30 mV inference droop degrade a 12-bit gas-sensor ADC's SNR?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0115", "title": "The Sleep Mode Wi-Fi Disconnect", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does ESP32 Light Sleep keep Wi-Fi powered yet still force a 3 s DHCP reconnect after motion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0117", "title": "The Flash Erase Suspend Lockout", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 300 ms SPI flash erase be safely suspended every 5 ms for a motor-control interrupt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0118", "title": "Solar + Supercapacitor + MCU System Design", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you bridge the 300x gap between harvest rate and compute demand to run inferences using a supercapacitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0119", "title": "Designing an Inference Duty Cycle on 0.5 mW Solar", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many 15 ms Ambiq Apollo4 keyword inferences per hour can a 0.5 mW solar cell sustain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0120", "title": "On-Device Anomaly Detection System", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the anomaly detection pipeline, including feature extraction, model architecture, and the threshold calibration strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0121", "title": "Power-Aware Inference Scheduler", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a power-aware scheduler that meets all real-time deadlines while maximizing battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0123", "title": "The Sub-Millisecond Fault Detector", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": 
"L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a sub-1 ms Cortex-M4 vibration fault detector for a 10,000 RPM motor?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0124", "title": "The Solar Harvesting Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "When can you run inference within the power budget, and how do you handle cloudy days?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 3}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0125", "title": "Always-On Multi-Modal Sensor Fusion System", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the full sensor fusion architecture, specifying which sensors are always-on vs triggered, the fusion model, and the power budget?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0128", "title": "The Battery Life Equation", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long will a 300 mAh 3.0V coin cell power a Cortex-M4 gesture model running 30 ms once per second?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 2}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0129", "title": "The Sleep Mode Wake-Up Cost", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an nRF52840 sound classifier average 300 µA when System OFF is only 0.3 µA and inference is 25 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0130", "title": "The Energy Harvesting Wall", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the maximum inference rate you can sustain indefinitely without a battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0131", "title": "The Voltage Scaling Tightrope", 
"topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do 3% of STM32L5 wildlife sensors misclassify after dropping from 1.2 V to 0.9 V at 26 MHz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0134", "title": "The Flash Page Erase Block", "topic": "real-time-deadlines", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 20-byte LittleFS log write to SPI flash stall for 500 ms and drop 10 audio frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0135", "title": "TinyML Model Serving Pipeline", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule camera DMA, preprocessing, inference, and GPIO actuation within a 200 ms conveyor window to guarantee zero missed products?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0136", "title": "MCU-Based Edge AI Gateway", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can the ESP32-S3 handle this workload, and what are the critical system bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0137", "title": "The 4x Integer Speedup", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When using the Arm CMSIS-NN library, what is the theoretical throughput gain for 8-bit integer operations that can be fully parallelized, compared to a naive C implementation?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 0}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1x (no speedup without a hardware FPU)", "2x (assuming only dual-MAC instructions apply)", "4x (packing four 8-bit integers into a 32-bit register)", "32x (confusing register width with SIMD throughput)"], "correct_index": 2}}, {"id": "tinyml-0138", "title": "The Flash vs. 
SRAM Divide", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Where are the convolutional filter values primarily located, and where is the tensor arena for calculating activations allocated?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 0}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both the weights and the tensor arena are loaded into SRAM.", "The weights are loaded into SRAM, and the tensor arena is allocated in Flash.", "The weights are stored in Flash, and the tensor arena is allocated in SRAM.", "Both the weights and the tensor arena are allocated in Flash memory."], "correct_index": 2}}, {"id": "tinyml-0139", "title": "The Requantization Shift", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the final arithmetic operation needed to produce the final INT8 value, `q_out`, before saturation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Subtract the output zero-point (`Z_out`) (Yields 60 instead of 40 in example).", "No operation is needed, just cast to INT8 (Yields 50 instead of 40).", "Add the output zero-point (`Z_out`).", "Multiply by the output zero-point (`Z_out`) (Causes overflow/clipping)."], "correct_index": 2}}, {"id": "tinyml-0140", "title": "The Depthwise Separable Cost Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What parameter-count reduction should you expect from replacing a standard 3x3 convolution with a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2-3x reduction", "Roughly 8-9x reduction", "Roughly 50-100x reduction", "No reduction in parameters, only in FLOPs"], "correct_index": 1}}, {"id": "tinyml-0141", "title": "The 1 Millisecond Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total latency budget for an interrupt-driven audio pipeline in a TinyML context?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 0}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "1 ms", "50 ns"], "correct_index": 2}}, {"id": "tinyml-0144", "title": "The Nanosecond Heist", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What operation timescale must a power-analysis attacker resolve to distinguish 
Cortex-M4 flash reads or individual instructions?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 0}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Milliseconds (ms)", "Microseconds (μs)", "Nanoseconds (ns)", "Picoseconds (ps)"], "correct_index": 2}}, {"id": "tinyml-0147", "title": "The SRAM Overflow Trap", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total SRAM requirement and explain why the crash is happening?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model fits. The 90 KB tensor arena is smaller than the 128 KB of available SRAM.", "The model is too large. The 250 KB model file must be loaded from Flash into the 128 KB SRAM, which is impossible.", "The device is out of SRAM. The required 138 KB (90 KB arena + 48 KB system) exceeds the 128 KB available. The model's 250 KB file size is for Flash storage, not runtime RAM.", "The device is out of Flash. The 250 KB model and 48 KB system SRAM don't leave enough space in the 512 KB of Flash for the OS."], "correct_index": 2}}, {"id": "tinyml-0149", "title": "The Flash vs. SRAM Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should you separately budget Flash for code/weights/OTA storage versus SRAM for runtime data?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 1}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it doesn't fit because SRAM (128 KB) + model weights (400 KB) = 528 KB > 256 KB.", "Yes, it fits. 
Flash usage is 946 KB and SRAM usage is 128 KB.", "No, it doesn't fit because Flash usage is 400+64+32 = 496 KB, making it impossible to add OTA.", "No, it doesn't fit because total memory is 946 + 128 = 1074 KB > 1024 KB."], "correct_index": 1}}, {"id": "tinyml-0151", "title": "The Cost of Unoptimized C Code", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long should a 5M-MAC KWS inference take on a 168 MHz Cortex-M4 when unoptimized INT8 C costs 4 cycles per MAC?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 1}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30 ms", "~60 ms", "~119 ms", "~238 ms"], "correct_index": 2}}, {"id": "tinyml-0152", "title": "The CMSIS-NN SIMD Dividend", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the new, optimized inference time on the same 168 MHz MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~119.0 ms (Assuming 4 cycles per MAC without SIMD)", "~59.5 ms (Assuming 2 cycles per MAC)", "~29.8 ms", "~7.4 ms (Assuming an impossible 4 MACs per cycle on INT8)"], "correct_index": 2}}, {"id": "tinyml-0153", "title": "The Real-Time MAC Budget", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum MAC count for a Cortex-M4 voice model that must finish within 33 ms at 168 MHz and 1 MAC per cycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.8 Million MACs", "~5.1 Million MACs", "~5.5 Million MACs", "~168 Million MACs"], "correct_index": 2}}, {"id": "tinyml-0155", "title": "The Cortex-M7 MAC Budget", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the model's computational needs to the MCU's capabilities and explain if the MCU can handle the load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, easily. 
The MCU has 480M cycles and the model only needs 9M.", "Yes, with about 50% headroom.", "No, the required 450 MMACs/sec is too close to the MCU's 480 MMACs/sec peak.", "No, it requires 900 MMACs/sec, which is double the MCU's capability."], "correct_index": 2}}, {"id": "tinyml-0157", "title": "The Great Flash/SRAM Divide", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Where should read-only quantized weights and the read-write Tensor Arena reside on a 1MB Flash, 256KB SRAM microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weights are stored in SRAM for speed; the Tensor Arena is in Flash.", "Both the weights and the Tensor Arena are placed in SRAM.", "Weights are stored in Flash; the Tensor Arena is allocated in SRAM.", "Both the weights and the Tensor Arena are placed in Flash to save SRAM."], "correct_index": 2}}, {"id": "tinyml-0159", "title": "The Whole-Graph Arena Plan", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you compute the minimum tensor arena size for a two-op TFLM model with buffer reuse and runtime tail overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75 KB", "145 KB", "85 KB", "80 KB"], "correct_index": 2}}, {"id": "tinyml-0160", "title": "The 1-Millisecond Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total latency budget you have, from the interrupt firing to classifying the event, to meet a hard real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 ms", "33 ms", "1 ms", "100 µs"], "correct_index": 2}}, {"id": "tinyml-0163", "title": "The MCU Performance Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a typical Cortex-M4 ML model more likely compute-bound or SRAM-bandwidth-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is memory-bound; fetching weights from SRAM is the bottleneck.", "It is compute-bound; the processor's calculation speed is the bottleneck.", "It is I/O-bound; the SPI bus for sensor data is the bottleneck.", "The compute and memory are perfectly balanced."], "correct_index": 1}}, {"id": "tinyml-0164", "title": "Microcontroller Arithmetic Intensity", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Based on your calculation, are these devices generally compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": "", "options": ["~3.57 FLOPs/Byte (Memory-bound)", "~280 FLOPs/Byte (Compute-bound)", "~0.5 FLOPs/Byte (Compute-bound)", "~0.25 FLOPs/Byte (Memory-bound)"], "correct_index": 2}}, {"id": "tinyml-0165", "title": "The 10mW Power Budget", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What compute rate can a 10 mW solar budget sustain on a 20 GOPS/W accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 GOPS", "2 GOPS", "0.2 GOPS (or 200 MOPS)", "0.002 GOPS (or 2 MOPS)"], "correct_index": 2}}, {"id": "tinyml-0166", "title": "The SRAM Budget Constraint", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical on-chip SRAM size you can expect to work with for the Tensor Arena?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 0}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 GB", "~2 MB", "~256 KB", "~50 mW"], "correct_index": 2}}, {"id": "tinyml-0169", "title": "The DMA Dividend", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many CPU cycles does DMA save when capturing 1 second of 16 kHz 16-bit audio versus PIO at 10 cycles per sample?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 0}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PIO: 32,000 cycles, DMA: 200 cycles", "PIO: 16,000 cycles, DMA: 200 cycles", "PIO: 160,000 cycles, DMA: 200 cycles", "PIO: 1,600,000 cycles, DMA: 200 cycles"], "correct_index": 2}}, {"id": "tinyml-0171", "title": "The Depthwise Separable Compute Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately what is the computational savings factor you should expect when replacing a standard 3x3 convolution with a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3x", "~9x", "~64x (the number of input channels)", "Over 100x"], "correct_index": 1}}, {"id": "tinyml-0173", "title": "The 1ms Interrupt Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the system feels instantaneous and never misses an audio packet, what is the typical latency budget for the Interrupt Service Routine (ISR) that handles this trigger?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms", "~33 ms", "~16 ms", "~1 ms"], "correct_index": 3}}, {"id": "tinyml-0174", "title": "The Dropped Audio Frame", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a Cortex-M4 process a 400 MFLOP, 1-second audio clip in real time without falling behind?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it takes ~1.19 seconds to process, which is longer than the 1-second audio clip duration.", "Yes, it only uses 84% of the processor's capacity (336 MFLOPS / 400 MFLOPs).", "No, it takes ~2.38 seconds because the 168 MHz clock speed only provides 168 MFLOPS.", "Yes, it can process two clips per second, taking about 0.59 seconds per clip."], "correct_index": 0}}, {"id": "tinyml-0175", "title": "The Flash Budget Crunch", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would you calculate the minimum level of unstructured weight pruning (as a percentage of sparsity) needed to fit the model into the remaining Flash?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0% (No pruning is needed)", "58% Sparsity", "20% Sparsity", "84% Sparsity"], "correct_index": 2}}, {"id": "tinyml-0178", "title": "The Sensor Bandwidth Chasm", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much faster is reading from on-chip SRAM compared to a standard I2C bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same speed", "About 100x faster", "About 24,000x faster", "About 1,000x faster"], "correct_index": 2}}, {"id": "tinyml-0180", "title": "The Federated Learning Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the daily energy consumption for data transmission per device for each approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: 50 J, Federated: 12.5 J. The federated approach is 4x more energy efficient.", "Centralized: 40 J, Federated: 100 J. The centralized approach is 2.5x more energy efficient.", "Centralized: 400 J, Federated: 100 J. The federated approach is 4x more energy efficient.", "Centralized: 40 mJ, Federated: 10 mJ. 
The difference is negligible at the fleet level."], "correct_index": 2}}, {"id": "tinyml-0181", "title": "The Microcontroller's Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate Ridge Point for a typical Cortex-M4 microcontroller, and what does this value signify for ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~336 Ops/Byte (Assuming peak MFLOPS directly maps to Ops/Byte without bandwidth division).", "~1.2 Ops/Byte (Confusing GB/s with Ridge Point directly).", "~0.28 Ops/Byte. It's heavily compute-bound.", "~168 Ops/Byte (Using clock speed instead of MFLOPS)."], "correct_index": 2}}, {"id": "tinyml-0183", "title": "The Race-to-Sleep Dilemma", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can a higher-power Cortex-M7 consume less energy than a Cortex-M4 for a fixed 100 MFLOP keyword-spotting inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4, because its active power rating (30mW) is lower.", "The Cortex-M7, because it finishes the computation faster, spending less time in an active state.", "They are equally energy-efficient because the total number of FLOPs is the same for both.", "It's impossible to know without the sleep power consumption for each MCU."], "correct_index": 1}}, {"id": "tinyml-0184", "title": "The TinyML Memory Wall: SRAM vs. 
Flash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "On a typical microcontroller used for a TinyML application, roughly how much slower is reading the model's weights from Flash memory compared to accessing the tensor arena in on-chip SRAM?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 1}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x slower", "About 1,000x slower", "About 10-15x slower", "They are nearly the same speed"], "correct_index": 2}}, {"id": "tinyml-0185", "title": "The SRAM Tensor Arena Puzzle", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum Tensor Arena size is needed for the keyword-spotting execution plan based on peak concurrent tensors?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 0}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["54 KB, the sum of all tensor sizes.", "30 KB, the size of the largest tensor.", "42 KB, the peak concurrent memory usage during the first layer's operation.", "40 KB, the peak concurrent memory usage during the second layer's operation."], "correct_index": 2}}, {"id": "tinyml-0186", "title": "The DMA Power-Saving Trade-Off", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which transfer approach minimizes energy for the 4 KB sensor sample, CPU memcpy or DMA with CPU sleep?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["CPU copy, because its total latency is lower (1.5 µJ vs 1.75 µJ calculated improperly).", "DMA, because it allows the power-hungry CPU to sleep during the transfer, saving significant energy.", "CPU copy, because the DMA setup overhead makes it inefficient for small data transfers.", "They are equivalent in power consumption because the transfer time is the same in both scenarios."], "correct_index": 1}}, {"id": "tinyml-0190", "title": "The Millisecond Machine Stop", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical, non-negotiable latency budget for such a hard real-time interrupt?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms (Typical for a cloud service response)", "33 ms (Typical for a real-time video frame on an edge device)", "16 ms (The budget to avoid UI 'jank' on a mobile 
device)", "1 ms (The budget for a hardware interrupt)"], "correct_index": 3}}, {"id": "tinyml-0192", "title": "The Energy Harvesting Deficit", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the Cortex-M7 air-quality sensor run indefinitely from a 2.0 mW solar panel while waking for 0.5 s every 10 s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it is sustainable; sleep power is negligible compared to generation.", "No, it is not sustainable; active power of 50mW far exceeds generation.", "No, it is not sustainable; it has a net energy deficit of ~0.5mW.", "Yes, it is sustainable; it has a net energy surplus of ~1.5mW."], "correct_index": 2}}, {"id": "tinyml-0193", "title": "The Energy Cost of Privacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the purpose of total device energy consumption, which of these two operations is more expensive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Running the model locally is more expensive; ML compute is always the bottleneck.", "The energy costs are roughly equal.", "Transmitting the image is ~20x more expensive.", "Transmitting the image is over 1,000x more expensive."], "correct_index": 2}}, {"id": "tinyml-0194", "title": "The Federated Learning Battery Dividend", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the difference in annual energy consumption between the two strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Centralized strategy consumes approximately 11.0 Joules more per year.", "The Centralized strategy consumes approximately 109.5 Joules more per year.", "The Centralized strategy consumes approximately 98.6 Joules more per year.", "The Centralized strategy consumes approximately 0.27 Joules more per year."], "correct_index": 2}}, {"id": "tinyml-0195", "title": "The Microcontroller Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point' for a Cortex-M4, and what does its value imply about where performance bottlenecks are likely to occur?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte, meaning most models are memory-bound.", "~0.2 Ops/Byte, meaning most models are memory-bound.", "~0.2 Ops/Byte, meaning most models are compute-bound.", "~20 Ops/Byte, meaning models are well-balanced between compute and memory."], "correct_index": 2}}, {"id": "tinyml-0198", "title": "The DMA Double-Buffer Lifeline", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", 
"zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much SRAM is required for the double-buffered 16 kHz, 16-bit audio pipeline when inference takes 120 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3,840 bytes", "30,720 bytes", "7,680 bytes", "32,000 bytes"], "correct_index": 2}}, {"id": "tinyml-0199", "title": "The Quantization Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much more energy does a single 32-bit floating-point (FP32) operation consume compared to an 8-bit integer (INT8) operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around 4x more energy.", "Around 18x more energy.", "Around 8x more energy.", "The energy savings are negligible (~1.2x)."], "correct_index": 1}}, {"id": "tinyml-0202", "title": "The Real-Time Deadline Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How long after the second audio chunk arrives will its processing complete on the single-threaded Cortex-M4?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 0}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600ms", "1000ms", "1200ms", "400ms"], "correct_index": 2}}, {"id": "tinyml-0203", "title": "The Real-Time Wakeword Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 finish a 70 MFLOP KWS inference before the next 1000 ms audio chunk arrives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it will take ~417 ms.", "No, it will take ~4.8 seconds.", "Yes, it will take ~208 ms.", "Yes, it will take only 0.208 ms."], "correct_index": 2}}, {"id": "tinyml-0204", "title": "The Solar-Powered Sensor's Inference Budget", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To remain power-neutral, what is the maximum number of inferences the station can perform per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450 inferences per hour", "1,796 inferences per hour", "1,800 inferences per hour", "18,000 inferences per hour"], "correct_index": 1}}, {"id": "tinyml-0205", "title": "The Privacy-First Principle of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason 
to choose a Federated Learning approach for model updates instead of collecting all audio data in the cloud to retrain a central model?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 0}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To achieve higher model accuracy than a centrally trained model.", "To lower the power consumption of the device during the learning process.", "To preserve user privacy by not sending raw audio data to the cloud.", "To reduce the network bandwidth costs of downloading the final, large model."], "correct_index": 2}}, {"id": "tinyml-0207", "title": "The Microcontroller's Compute Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What does the Ridge Point of a roofline model for a Cortex-M4 indicate about its performance, and what is its approximate value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5.0 Bytes/Op, meaning most workloads are memory-bound. (Inverted ratio trap)", "~1.6 Ops/Byte, meaning memory and compute are perfectly balanced. (Forgot 8-bit to 32-bit word size multiplier)", "~0.2 Ops/Byte, meaning most workloads are compute-bound.", "~0.025 Ops/Bit, meaning memory bandwidth is severely constrained. (Bit vs Byte confusion trap)"], "correct_index": 2}}, {"id": "tinyml-0208", "title": "TinyML Roofline: Compute or Memory Bound?", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the layer's arithmetic intensity, and is it compute-bound or memory-bound on the Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because its intensity is ~2.07 Bytes/Op.", "Memory-bound, because an Arithmetic Intensity of ~0.48 Ops/Byte is very low.", "Compute-bound, because its Arithmetic Intensity (~0.48 Ops/Byte) is higher than the Cortex-M4's ridge point (~0.28 Ops/Byte).", "Compute-bound, because all ML operations on microcontrollers are limited by CPU speed."], "correct_index": 2}}, {"id": "tinyml-0209", "title": "TinyML Tensor Arena Sizing", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To prevent memory allocation errors, what is the minimum required size for the Tensor Arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25 KB", "49 KB", "35 KB", "37 KB"], "correct_index": 2}}, {"id": "tinyml-0210", "title": "DMA vs. 
CPU for Sensor Data", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much CPU time does a direct memcpy of the 1-second audio buffer take compared with a DMA transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["DMA is slower due to a 1.5ms setup overhead that exceeds the memcpy time.", "The CPU copy takes ~0.76ms, while the DMA transfer takes ~0 CPU time.", "Both take ~0.76ms because they share the same physical memory bus.", "The CPU copy takes ~7.6ms because you need 40 cycles/byte."], "correct_index": 1}}, {"id": "tinyml-0211", "title": "The Quantization Energy Cliff", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "From a pure hardware physics perspective, approximately how much more energy does a single FP32 compute operation consume compared to a single INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4×", "~3.4×", "~18×", "~100×"], "correct_index": 2}}, {"id": "tinyml-0212", "title": "The Depthwise Convolution Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What approximate compute reduction does a 3x3 depthwise separable convolution provide over a standard convolution?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 0}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About a 3x reduction", "The savings are negligible on microcontrollers", "About a 9x reduction", "It's a 2x reduction, same as using FP16 instead of FP32"], "correct_index": 2}}, {"id": "tinyml-0213", "title": "The Unforgiving Audio Buffer", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To guarantee you never lose any incoming audio, what is the absolute hard real-time deadline by which inference must complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1ms", "33ms", "100ms", "16ms"], "correct_index": 2}}, {"id": "tinyml-0214", "title": "The Dropped Audio Packet", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the Cortex-M4 process the 200 MFLOP audio chunk within the strict 1000 ms real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it would take over 500 seconds to process one chunk.", "No, it can only process about 1.68 chunks per second, which is too slow.", "Yes, it takes about 
595ms, which is less than the 1000ms deadline.", "Yes, but the 95ms of slack time is too small for a production system."], "correct_index": 2}}, {"id": "tinyml-0215", "title": "The Sleep-Wake Power Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the approximate ratio of active power consumption to deep sleep power consumption for a typical Cortex-M4 class microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10x", "~100x", ">1,000x", "They are roughly the same"], "correct_index": 2}}, {"id": "tinyml-0216", "title": "The TCO of TinyML: On-Device vs. Cloud Power", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the average power draws of the TinyML and cloud approaches over the 60-second cycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud peak power is 200 mW and TinyML is 10 mW, so it uses 20x more power.", "The TinyML device uses about 10 mW on average because the sleep power is negligible.", "Cloud uses ~16.7 mW and TinyML uses ~0.18 mW, a difference of nearly 100x.", "Both are in the low mW range; the power difference is not significant for TCO."], "correct_index": 2}}, {"id": "tinyml-0217", "title": "The Microcontroller's Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Given its hardware specifications, what is the primary performance bottleneck you would typically expect to encounter according to the Roofline Model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, as the MCU's MAC throughput is the main constraint.", "Memory-bound, as the data movement from SRAM to the CPU is the main constraint.", "Power-bound, as the MCU cannot draw enough power to run at its peak frequency.", "Flash-bound, as reading the model weights from flash storage is the bottleneck."], "correct_index": 0}}, {"id": "tinyml-0219", "title": "The SRAM Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What tensor-arena peak is required for 40 KB input, 32 KB Conv1, and 8 KB Conv2 tensors on a 256 KB MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 KB", "32 KB", "72 KB", "40 KB"], "correct_index": 2}}, {"id": "tinyml-0220", "title": "The Hidden Cost of CPU `memcpy`", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What CPU opportunity cost does a CPU-driven `memcpy` impose for a 4 KB audio frame compared to using DMA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["The CPU is busy for ~24µs with memcpy, which is negligible in a 20ms budget.", "DMA is slower because it has configuration overhead.", "The CPU is busy for ~24µs with memcpy, stalling inference, while it's only busy for <1µs with DMA, enabling parallelism.", "Both methods take roughly the same time since SRAM bandwidth is the bottleneck."], "correct_index": 2}}, {"id": "tinyml-0223", "title": "The Interrupt Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hard real-time latency budget should an interrupt-triggered TinyML inference target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 3}}, {"id": "tinyml-0224", "title": "The Real-Time Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the system meet its hard real-time deadline of finishing one clip before the next 1-second clip arrives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~298 ms", "~303 ms", "~3.36 s", "~8 ms"], "correct_index": 1}}, {"id": "tinyml-0227", "title": "The TinyML Update Cost Fallacy", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the numbers provided, which of the following is the largest cost associated with this single, global update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cloud compute cost (GPU hours) to train the new model.", "The aggregate cellular data transmission cost.", "The electricity cost for all one million devices to power their modems during the download.", "The initial hardware (CapEx) cost of the microcontrollers in the fleet."], "correct_index": 1}}, {"id": "tinyml-0228", "title": "The TinyML Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What does the 0.28 Ops/Byte ridge point mean on a Cortex-M4 roofline plot?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 0}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak theoretical performance (MFLOPS) of the microcontroller.", "The maximum memory bandwidth (GB/s) of the on-chip SRAM.", "The minimum Arithmetic Intensity (Ops/Byte) needed to be compute-bound.", "The energy cost (in pJ) of a single memory access."], "correct_index": 2}}, {"id": "tinyml-0229", "title": "The MCU Memory Wall", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": 
"published", "phase": "inference", "question": "What is the Cortex-M4 ridge point, and what does it imply about whether typical ML layers are compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 Ops/Byte, making it profoundly memory-bound.", "~3.57 Bytes/Op, meaning it's inefficient.", "~0.28 Ops/Byte, making it profoundly memory-bound.", "~0.28 Ops/Byte, making it typically compute-bound."], "correct_index": 3}}, {"id": "tinyml-0232", "title": "The Depthwise Separable Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of using a depthwise separable convolution instead of a standard convolution in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly increases model accuracy by capturing more complex features.", "It allows the model to handle variable-length audio inputs without padding.", "It dramatically reduces the number of parameters and computations (MACs).", "It's inherently more robust to noise in the audio signal."], "correct_index": 2}}, {"id": "tinyml-0234", "title": "The Real-Time Radar Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which performance metric is the most critical to optimize to meet this product requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput (inferences/sec)", "Time To First Token (TTFT)", "End-to-End Latency", "Power Consumption (mW)"], "correct_index": 2}}, {"id": "tinyml-0241", "title": "Microcontroller Performance Reality", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on its fundamental hardware architecture, would you expect the model's performance to be limited by its compute capability (compute-bound) or by its memory bandwidth (memory-bound)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because all neural networks are memory-bound.", "Memory-bound, because microcontrollers have very little SRAM.", "Compute-bound, because the FPU is weak relative to the fast SRAM bandwidth.", "I/O bound, because reading from the microphone sensor is the slowest part."], "correct_index": 2}}, {"id": "tinyml-0246", "title": "The Depthwise Memory Footprint", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What parameter reduction comes from replacing a 3x3 16-to-32 standard convolution with depthwise separable convolution?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 1}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": "", "options": ["A ~32x reduction in parameters", "A ~2x reduction in parameters", "A ~7x reduction in parameters", "No significant change in parameters"], "correct_index": 2}}, {"id": "tinyml-0250", "title": "The TinyML Memory Diet: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate and explain the difference in the model's weight storage before and after INT8 quantization?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 0}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model size is reduced from 1000 KB to 250 KB, saving 750 KB.", "The model size is reduced by a factor of 8x because you go from 16-bit floats to 8-bit integers.", "The model size is reduced from 500 KB to 250 KB, saving 250 KB.", "The model size is reduced from 500 KB to 125 KB, saving 375 KB."], "correct_index": 2}}, {"id": "tinyml-0254", "title": "The SRAM Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum tensor arena size required when the 40 KB input, 20 KB intermediate, and 10 KB output tensors overlap?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 1}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 KB", "40 KB", "60 KB", "30 KB"], "correct_index": 2}}, {"id": "tinyml-0256", "title": "The SRAM Memory Ceiling", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What typical on-chip SRAM range should you assume for a Cortex-M4 class TinyML device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 GB", "8 GB", "256 KB - 2 MB", "32 MB"], "correct_index": 2}}, {"id": "tinyml-0258", "title": "The Audio Buffer Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To ensure no audio data is ever lost, what is the absolute maximum processing latency your model can have to process one buffer?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 1}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["62.5 us", "640 ms", "1000 ms", "500 ms"], "correct_index": 2}}, {"id": "tinyml-0259", "title": "The Truck Roll Multiplier", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": 
"published", "phase": "inference", "question": "Over the product's 5-year lifespan, what single factor is most likely to dominate the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud computing costs for aggregating data and training model updates.", "Initial hardware purchase (CapEx) of the 100,000 devices.", "Physical maintenance and battery replacement ('truck rolls').", "Energy consumption of the entire device fleet over 5 years."], "correct_index": 2}}, {"id": "tinyml-0261", "title": "The Interrupt Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hard real-time latency budget is typical for interrupt-driven TinyML inference?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms", "~33 ms", "~16 ms", "~1 ms"], "correct_index": 3}}, {"id": "tinyml-0262", "title": "The TinyML Compute Threshold", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point'—the minimum arithmetic intensity (Ops/Byte) required for a workload to become compute-bound on this class of device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte", "~10 Ops/Byte", "~0.5 Ops/Byte", "~50 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0263", "title": "The MCU Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate ridge point for this MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 Ops/Byte", "~3.57 Bytes/Op", "~0.28 Ops/Byte", "~1,342 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0266", "title": "The SRAM vs. Flash Fallacy", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate *working memory* (SRAM) you would state is actually available for the model's runtime operations like activations?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 0}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 MB", "~1 MB", "~256 KB", "~32 KB"], "correct_index": 2}}, {"id": "tinyml-0269", "title": "The TinyML Economics of Inference: On-Device vs. 
Cloud", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which approach is more energy-efficient for hourly keyword checks, on-device inference or cloud-assisted transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud-assisted is more efficient because it avoids running complex computation on the low-power device.", "The energy difference is negligible because sleep power dominates the total consumption over 24 hours.", "On-device is >10x more energy-efficient because the radio transmission power is far greater than the local compute power.", "They are roughly equal; the energy saved from not computing locally is offset by the energy spent on transmission."], "correct_index": 2}}, {"id": "tinyml-0273", "title": "The INT8 Energy Dividend", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much more energy-efficient is a single INT8 MAC operation compared to a single FP32 MAC operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.4x more efficient", "~4x more efficient", "~18x more efficient", "~580x more efficient"], "correct_index": 2}}, {"id": "tinyml-0274", "title": "The Depthwise Efficiency Dividend", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What MAC reduction should a 3x3 depthwise separable convolution deliver relative to a standard 3x3 convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is about 2x cheaper.", "They are computationally equivalent.", "It is about 9x cheaper.", "It is about 9x more expensive."], "correct_index": 2}}, {"id": "tinyml-0275", "title": "The Depthwise Separable Memory Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What parameter reduction results from replacing the 3x3 16-to-32 channel convolution with a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A ~32x reduction.", "A ~9x reduction.", "A ~7x reduction.", "A ~2x reduction."], "correct_index": 2}}, {"id": "tinyml-0278", "title": "The Microcontroller's Low Ridge", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware arithmetic intensity (Ridge Point) of a Cortex-M4, where does the bottleneck lie for most neural network operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ridge point is very high (~280 
Ops/Byte), making workloads memory-bound.", "The ridge point is very low (~0.28 Ops/Byte), but this means workloads are always memory-bound.", "The ridge point is very low (~0.28 Ops/Byte), making most neural network workloads compute-bound.", "The ridge point is irrelevant for microcontrollers; only power matters."], "correct_index": 2}}, {"id": "tinyml-0281", "title": "The INT8 Energy Prize", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What per-operation energy savings should FP32-to-INT8 quantization provide for the coin-cell keyword spotting model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x savings", "~3.4x savings", "~18x savings", "~100x savings"], "correct_index": 2}}, {"id": "tinyml-0284", "title": "The Energy Tax of the Cloud", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which energy component dominates TCO for the always-listening smart home sensor, on-device compute or network streaming?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device compute energy, because neural network FLOPs are expensive.", "The network streaming energy and on-device compute are roughly equal.", "The network streaming energy, by several orders of magnitude.", "The network streaming energy, but only by a small amount."], "correct_index": 2}}, {"id": "tinyml-0285", "title": "The Privacy Premium: On-Device vs. 
Cloud TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming no other differing costs, what is the Total Cost of Ownership (TCO) difference between the two architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$165,000 cheaper", "$795,000 cheaper", "$895,000 cheaper", "$1,095,000 cheaper"], "correct_index": 2}}, {"id": "tinyml-0286", "title": "The Microcontroller's Low Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What fundamental performance bottleneck does this extremely low value imply for most neural network workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The device is almost always memory-bound.", "The device is almost always compute-bound.", "The device is almost always power-bound.", "The device's performance is limited by its flash storage speed."], "correct_index": 1}}, {"id": "tinyml-0287", "title": "The MCU Ridge Point: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Calculate the Ridge Point for this Cortex-M4 and interpret what it means for the relationship between compute and memory.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.57 Bytes/Op. It means most workloads will be memory-bound.", "~280 Ops/Byte. It means the chip requires extremely high arithmetic intensity to be compute-bound.", "~0.28 Ops/Byte. It means most ML workloads will be compute-bound.", "~1.4 Ops/Byte. 
This would be typical for a more powerful edge device, not an MCU."], "correct_index": 2}}, {"id": "tinyml-0290", "title": "The Depthwise Separable Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does replacing a 3x3 standard convolution with depthwise separable convolution help a Cortex-M4 CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It primarily reduces the model's parameter count and flash memory footprint.", "It significantly improves the model's prediction accuracy.", "It primarily reduces the number of computational operations (FLOPs).", "It enables the use of specialized hardware instructions on the microcontroller."], "correct_index": 2}}, {"id": "tinyml-0291", "title": "The Depthwise Memory Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total number of weight parameters required for this single depthwise separable layer, assuming INT8 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["432 parameters", "27 parameters", "75 parameters", "48 parameters"], "correct_index": 2}}, {"id": "tinyml-0292", "title": "The Tyranny of Sleep Current", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To meet a 5-year battery life goal, which power metric is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Active power during inference (mW)", "Deep sleep power consumption (µW)", "Model size on Flash (KB)", "Peak compute performance (MFLOPS)"], "correct_index": 1}}, {"id": "tinyml-0294", "title": "The Microcontroller's Compute Limit", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point' (operational intensity) for this class of device, and what does it tell you about where the bottleneck usually is?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte; most workloads are memory-bound.", "~0.2 Ops/Byte; most workloads are memory-bound.", "~0.2 Ops/Byte; most workloads are compute-bound.", "~1,300 Ops/Byte; workloads are balanced between compute and memory."], "correct_index": 2}}, {"id": "tinyml-0296", "title": "The TinyML SRAM Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What SRAM footprint should you count for the keyword-spotting deployment, and why is the Flash-stored model size irrelevant?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": 
{"tinyml-chain-auto-009-09": 0}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["272 KB. It won't fit.", "68 KB. It will fit.", "92 KB. It will fit.", "164 KB. It will fit."], "correct_index": 2}}, {"id": "tinyml-0300", "title": "The Power Budget Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of power consumed by a typical TinyML microcontroller when actively running inference versus when it is in deep sleep?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A. ~10x", "B. ~100x", "C. ~10,000x", "D. ~1,000,000x"], "correct_index": 2}}, {"id": "tinyml-0301", "title": "The Economics of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Calculate the total daily data upload volume for this federated system.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 GB", "500 KB", "5 GB", "5 MB"], "correct_index": 2}}, {"id": "tinyml-0302", "title": "The TinyML Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'Ridge Point' for a typical Cortex-M4, which tells us its operational intensity in Ops-per-Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~336 Ops/Byte (Using MFLOPS directly, ignoring bandwidth).", "~1.2 Ops/Byte (Confusing bandwidth with the Ridge Point).", "~0.28 Ops/Byte (Correct calculation: 336 MFLOPS / 1.2 GB/s).", "~168 Ops/Byte (Using MHz instead of MFLOPS)."], "correct_index": 2}}, {"id": "tinyml-0311", "title": "The MCU's Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the dense layer's arithmetic intensity, and is it compute-bound or memory-bound on the Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because the layer's Arithmetic Intensity (~1.95 Ops/Byte) is greater than the MCU's ridge point.", "Memory-bound, because all operations on resource-constrained devices like microcontrollers are limited by memory bandwidth.", "Compute-bound, because the layer's Arithmetic Intensity (~1.95 Ops/Byte) is greater than the MCU's ridge point (~0.28 Ops/Byte).", "Compute-bound, because its Arithmetic Intensity is low, which means it doesn't require much data from memory."], "correct_index": 2}}, {"id": "tinyml-0314", "title": "The Separable Convolution Cost-Cutter", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": 
"recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of this architectural change for a resource-constrained device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0319", "title": "The Microcontroller Roofline", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of the FP32 3x3 depthwise convolution, and is it compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.2 Ops/Byte; Memory-Bound", "~1.1 Ops/Byte; Compute-Bound", "~2.2 Ops/Byte; Compute-Bound", "~0.28 Ops/Byte; Memory-Bound"], "correct_index": 2}}, {"id": "tinyml-0321", "title": "The Energy Cost of Precision: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much more energy does a 16-bit floating-point (FP16) multiplication consume compared to an 8-bit integer (INT8) multiplication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x more energy", "Roughly the same, as energy is dominated by memory access", "About 5x more energy", "About 18x more energy"], "correct_index": 2}}, {"id": "tinyml-0322", "title": "The Kilobyte Wall", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware specifications for TinyML devices, which resource constraint will you almost certainly hit first and is generally the hardest to overcome?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute (MFLOPS)", "Power Draw (mW)", "Memory (SRAM)", "Flash Storage Size"], "correct_index": 2}}, {"id": "tinyml-0323", "title": "The Flash Budget Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many INT8 parameters are saved by replacing a 3x3 16-to-32 convolution with a depthwise separable layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameters are reduced by ~32x", "The parameters are reduced by ~9x", "The parameters are reduced by ~7x", "The parameters are reduced by ~2x"], "correct_index": 2}}, {"id": "tinyml-0326", "title": "Microcontroller's Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on typical hardware specs, what is its approximate ridge point (Ops/Byte), and what does this imply?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte, implying workloads are always compute-bound.", "~10 Ops/Byte, implying a balance between compute and memory.", "~0.28 Ops/Byte, implying workloads are almost always compute-bound.", "The concept of a ridge point does not apply to microcontrollers, only GPUs."], "correct_index": 2}}, {"id": "tinyml-0328", "title": "The Tensor Arena Sizing Puzzle", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What tensor arena size follows from the peak coexisting tensors rather than summing every tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["71 KB", "40 KB", "60 KB", "50 KB"], "correct_index": 2}}, {"id": "tinyml-0331", "title": "The Depthwise Separable Memory Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much INT8 parameter memory does a 3x3, 32-input, 64-output standard convolution require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.3 KB", "~8.2 KB", "~18.0 KB", "288 Bytes"], "correct_index": 2}}, {"id": "tinyml-0332", "title": "The Privacy Power Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a hardware physics perspective, what is the primary cost associated with this privacy-enhancing computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased Flash memory required to store the computation model.", "Increased network latency to transmit the computed features.", "Increased total energy consumption due to longer active time.", "Increased peak MFLOPS demand on the microcontroller."], "correct_index": 2}}, {"id": "tinyml-0333", "title": "The TCO of a Sleeping Army", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "At $0.15 per kWh, what is the approximate total energy cost to operate this 100,000-sensor fleet for one year?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1,314", "~$14,440", "~$14.44", "< $0.01"], "correct_index": 2}}, {"id": "tinyml-0334", "title": "The Microcontroller's Memory Wall: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the fundamental architectural trade-offs of this class of device, would you generally expect the model's performance to be compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the typical arithmetic intensity of neural network layers is higher than the microcontroller's low ridge point.", "Memory-bound, because the chip's memory bandwidth is very low compared to its processing speed.", "Power-bound, because the device has a strict thermal design power (TDP) of a few milliwatts.", "It depends entirely on whether the model uses depthwise or standard convolutions."], "correct_index": 0}}, {"id": "tinyml-0335", "title": "The TinyML Compute-Memory Tradeoff", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this convolutional layer compute-bound or memory-bound on the Cortex-M4?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 1}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because TinyML devices have very little SRAM, so they are always waiting on memory.", "Memory-bound, because the arithmetic intensity is ~0.1 Ops/Byte, which is lower than the ridge point.", "Compute-bound, because the layer's arithmetic intensity (~81 Ops/Byte) is much higher than the MCU's ridge point (~0.28 Ops/Byte).", "Compute-bound, because its power consumption is high, which means it is doing a lot of computation."], "correct_index": 2}}, {"id": "tinyml-0336", "title": "The TinyML Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak tensor arena memory is needed at Layer 5, and does it fit within 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 KB. The arena only needs to hold the single largest tensor (Layer 5 output).", "1.0 KB. Only the smallest input tensor matters.", "3.0 KB. The sum of the input (1.0 KB) and output (2.0 KB) tensors for the peak operation.", "6.0 KB. 
You must sum all tensors in the model."], "correct_index": 2}}, {"id": "tinyml-0338", "title": "The Depthwise Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of FLOPs between the standard and the optimized layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Approximately 2x reduction", "Approximately 8-9x reduction", "Approximately 64x reduction (equal to the number of output channels)", "The reduction is equal to the stride of the convolution"], "correct_index": 1}}, {"id": "tinyml-0340", "title": "The Wildlife Camera's Lifespan", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate battery life in days after computing the wildlife camera's duty-cycled average power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.1 days", "~5.6 days", "~4.8 days", "~0.6 days"], "correct_index": 2}}, {"id": "tinyml-0344", "title": "The Microcontroller's Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity (in FLOPs/Byte) required to saturate the processor's compute capability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte", "280 Ops/Byte", "0.28 Ops/Byte", "295 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0345", "title": "The TinyML Tensor Arena: Tensor Arena Planning", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To minimize RAM usage, what is the absolute minimum size required for the tensor arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 KB", "50 KB", "100 KB", "400 KB"], "correct_index": 1}}, {"id": "tinyml-0346", "title": "The INT8 Energy Payoff", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a pure computational energy perspective, what is the approximate energy saving for a single operation when you move from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It consumes ~4× less energy.", "It consumes ~2× less energy.", "It consumes ~18× less energy.", "The energy consumption is roughly the same."], "correct_index": 2}}, {"id": "tinyml-0348", "title": "The Flash Memory Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": 
"inference", "question": "What parameter reduction does a 3x3 depthwise separable convolution provide for the 64-to-128 channel layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2x. The savings are minor.", "Roughly 8.4x. It separates the spatial and cross-channel operations, drastically cutting parameters.", "Roughly 128x. It only performs the depthwise step, which is highly efficient.", "There is no reduction; it's a compute optimization, not a memory optimization."], "correct_index": 1}}, {"id": "tinyml-0349", "title": "The Flash Budget Squeeze", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on typical resource allocation in TinyML systems, which component represents the largest \"tax\" on your flash budget, directly competing with the size of your ML model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Real-Time Operating System (RTOS)", "The Bootloader", "The Over-the-Air (OTA) download partition", "The ML model's activation buffers stored in flash"], "correct_index": 2}}, {"id": "tinyml-0352", "title": "The Cortex-M4 Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the Roofline ridge point signify, and what is its calculated value for the given hardware specifications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte", "0.28 Ops/Byte", "280 Ops/Byte", "295 Ops/Byte"], "correct_index": 1}}, {"id": "tinyml-0357", "title": "The TCO of Transmission", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What recurring TCO cost is reduced most by running TinyML inference on-device instead of streaming audio?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The recurring cost of cloud inference endpoints.", "The upfront Bill of Materials (BOM) cost for a more powerful microcontroller.", "The energy cost of wireless data transmission.", "The engineering cost of developing and maintaining the cloud data ingestion pipeline."], "correct_index": 2}}, {"id": "tinyml-0358", "title": "The Microcontroller Memory Wall", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware characteristics of a Cortex-M4, what is the primary performance bottleneck you are likely facing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": 
["The system is compute-bound; the CPU's MFLOPS rating is the bottleneck.", "The system is memory-bound; data cannot be fed to the CPU fast enough.", "The system is power-bound; the model is causing the chip to thermally throttle.", "The system is compiler-bound; the toolchain isn't generating efficient instructions."], "correct_index": 0}}, {"id": "tinyml-0359", "title": "The Microcontroller's Low Bar", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does a 0.28 Ops/Byte ridge point on Cortex-M4 mean for choosing TinyML model architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte. Workloads must perform over 3 operations per byte to be compute-bound.", "280 Ops/Byte. Nearly all workloads will be severely memory-bound.", "0.28 Ops/Byte. Most workloads will be compute-bound as they easily exceed this low arithmetic intensity requirement.", "29.5 Ops/Byte. This is a typical ridge point, making the choice of compute- vs memory-intensive layers critical."], "correct_index": 2}}, {"id": "tinyml-0360", "title": "The Tensor Arena Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the peak required tensor arena size, and will it fit within the available SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["110 KB; No, it exceeds the 106 KB of available SRAM.", "110 KB; Yes, it fits easily within the total 256 KB of SRAM.", "50 KB; Yes, the peak is determined by the largest tensor, which fits.", "90 KB; Yes, it fits with room to spare."], "correct_index": 0}}, {"id": "tinyml-0364", "title": "The Economics of Fleet Updates: Centralized vs. Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which update strategy costs less in data transfer for 1M doorbells: centralized image uploads or federated gradients?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 1}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized requires ~10 GB ($0.90); Federated requires ~600 GB ($54).", "Centralized requires ~1,500 GB; Federated requires ~1,500 GB.", "Centralized requires ~1,500 GB ($135); Federated requires ~600 GB ($54). 
Federated is 2.5x cheaper.", "Federated requires ~1,500 GB; Centralized requires ~600 GB."], "correct_index": 2}}, {"id": "tinyml-0365", "title": "The Real-Time Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum processing time per frame to guarantee zero dropped frames on a 10 FPS camera stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 ms", "33 ms", "100 ms", "1 ms"], "correct_index": 2}}, {"id": "tinyml-0367", "title": "The Energy Cost of Learning on the Edge", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a typical TinyML device, which single operation is the most significant contributor to its energy consumption during one federated learning cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reading the training data from on-chip Flash memory.", "Performing the on-device model training (computation).", "Transmitting the model update to the server.", "Maintaining the device in its low-power sleep state between cycles."], "correct_index": 2}}, {"id": "tinyml-0371", "title": "The INT8 Memory Diet", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the expected reduction factor for the memory occupied by the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4x", "~3.4x", "2x", "1.1x"], "correct_index": 2}}, {"id": "tinyml-0374", "title": "The Economics of On-Device Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which weekly update strategy sends less data for 1M thermostats: 100 KB raw daily uploads or 250 KB federated weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: 100 GB/day, Federated: 250 GB/day. Centralized is better.", "Centralized: 100 MB/day, Federated: ~36 MB/day. The difference is minor.", "Centralized: 100 GB/day, Federated: ~36 GB/day. Federated is ~3x more efficient.", "Centralized: 100 KB/day, Federated: 250 KB/day. 
Centralized is better."], "correct_index": 2}}, {"id": "tinyml-0381", "title": "The NAS Discovery on a Microcontroller", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary, most impactful reason the NAS prefers depthwise separable convolutions in such a constrained environment?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 0}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It has a higher FLOP-per-byte ratio, improving its arithmetic intensity.", "It consistently improves the final accuracy of the model.", "It dramatically reduces the number of parameters and computations.", "It is more resistant to quantization errors when converting to INT8."], "correct_index": 2}}, {"id": "tinyml-0382", "title": "The Economics of Awakening: A TinyML Power Budget", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the 10,000-sensor TinyML design draw when KWS runs every 10 s and radio transmits hourly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~160 mW", "~0.11 mW", "~0.30 mW", "~500 mW"], "correct_index": 2}}, {"id": "tinyml-0384", "title": "The Real-Time KWS Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Based on the hardware constants, can this MCU architecture keep up with the continuous stream of audio data without falling behind?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes. The MCU takes ~238 ms per inference (80 MFLOPs / 336 MFLOPS), which is less than the 250 ms deadline from the window stride.", "Yes, easily. The MCU's inference time of ~238 ms is much shorter than the 1000 ms audio clip, leaving over 750 ms of slack.", "No. The MCU is too slow. The required processing time is 4.2 seconds (80 MFLOPs / 20 MFLOPS at 10 MHz effective throughput), which badly misses the 250 ms deadline.", "No. The system needs to process 4 windows per second (1000ms / 250ms), requiring 320 MFLOPS (4 * 80), but the MCU only runs at 168 MHz."], "correct_index": 0}}, {"id": "tinyml-0387", "title": "The Microcontroller Roofline Dilemma", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the layer's arithmetic intensity, and does it make the Cortex-M4 execution compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound. The Bytes/FLOP ratio is ~0.51, indicating a memory bottleneck.", "Memory-bound. An arithmetic intensity of ~1.95 is high, so it needs a lot of data, saturating the memory bus.", "Compute-bound. 
The layer's arithmetic intensity (~1.95 FLOPs/Byte) is greater than the Cortex-M4's ridge point (~0.28 FLOPs/Byte).", "Compute-bound. 336 MFLOPS is always the bottleneck on a microcontroller, regardless of data movement."], "correct_index": 2}}, {"id": "tinyml-0388", "title": "The TinyML Tensor Arena Trap", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should you calculate the minimum tensor arena size for the keyword-spotting model's activation tensors?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 1}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["58 KB", "28 KB", "44 KB", "40 KB"], "correct_index": 2}}, {"id": "tinyml-0390", "title": "The Depthwise Separable Memory Saver", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many parameters are in a standard 3x3 convolution with 16 input channels and 32 filters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["288 parameters", "656 parameters", "4,608 parameters", "9,216 parameters"], "correct_index": 2}}, {"id": "tinyml-0391", "title": "The Power Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many times more energy does cloud transmission consume than on-device Cortex-M4 inference for one audio clip?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's about the same; compute costs are comparable to communication.", "2x more energy", "20x more energy", "100x more energy"], "correct_index": 2}}, {"id": "tinyml-0393", "title": "The Wake-Word Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which metric must you optimize to guarantee the wake-word system meets its hard real-time response deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput in inferences/second", "Batch processing efficiency", "Single-inference end-to-end latency", "Average power consumption over one minute"], "correct_index": 2}}, {"id": "tinyml-0394", "title": "The Real-Time Audio Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the audio pipeline run stably in real time when frames arrive every 10 ms but inference takes 45 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the 45ms processing time is well under the typical 500ms application deadline for a 
keyword.", "Yes, if you batch 4 frames together, you can process them all at once.", "No, because the processing time (45ms) is greater than the data arrival interval (10ms).", "No, because the total latency per frame would be 55ms (45ms + 10ms), which is too high."], "correct_index": 2}}, {"id": "tinyml-0403", "title": "The TCO of Privacy: Federated vs. Centralized Data Upload", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which strategy is more economical for data transfer over a year, and what are the approximate costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The costs are comparable, with Federated Learning saving only a few thousand dollars (Centralized: ~$21,000 vs. Federated: ~$18,000).", "Federated Learning is vastly cheaper, costing about $18 per year compared to over $21,000 for the centralized approach.", "The centralized approach is cheaper, as the 100 KB daily model update is larger than the raw audio stream.", "The centralized approach costs about $2,600 per year, making it more expensive than Federated Learning, but still feasible."], "correct_index": 1}}, {"id": "tinyml-0406", "title": "The Hardware MAC Unit Misconception", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Are a Cortex-M4 and Cortex-M55 equally fast for ML just because both have a single-cycle hardware multiplier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0407", "title": "The Debug Interface Profiling Trap", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the debugger add 14 ms to your inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0408", "title": "The Hardware Divider Stall", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does `val / max_val` cost about 40 cycles inside a Cortex-M0+ normalization loop?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 0}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0409", "title": "The 16-bit MAC Overflow", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an `int16_t` accumulator corrupt a 128-element INT8 dot product on a 16-bit MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0410", "title": "The MAC Budget", "topic": 
"mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can you run inference in under 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, a 168 MHz processor does 1 MAC per cycle, completing 5M MACs in ~30 ms.", "No, standard integer execution takes 4 cycles per MAC, missing the budget at ~119 ms.", "Yes, but only by utilizing CMSIS-NN SIMD instructions (2 MACs/cycle), dropping latency to ~15 ms.", "No, even with SIMD it takes 2 cycles per MAC, taking ~60 ms."], "correct_index": 2}}, {"id": "tinyml-0411", "title": "The HVAC False Positive", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does an nRF5340 keyword model's false positive rate jump from 1% to 12% when HVAC cycles every 15 minutes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0412", "title": "The Watchdog Reset During Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 280 ms RP2040 inference trip a 500 ms watchdog every few hours in the field?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 1}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0414", "title": "Bootloader A/B Partition Sizing", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should model size and runtime requirements shape the A/B OTA flash layout, and why do delta model updates help?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0415", "title": "Inference Cycles on Cortex-M4", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long should a 6M-MAC DS-CNN take on a 168 MHz scalar Cortex-M4 versus CMSIS-NN on Cortex-M4F?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 2}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0416", "title": "Cortex-M55 Helium Speedup for Depthwise Conv", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the speedup for a 3x3 depthwise 
convolution when migrating from Cortex-M4F to Cortex-M55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The M55 does 8 MACs/cycle vs the M4F's 2 MACs/cycle, and clocks slightly slower. The speedup is exactly (8/2) * (160/168) = 3.8x.", "Depthwise convolutions cannot be vectorized. The speedup is purely based on the clock speed ratio: 160/168 = 0.95x.", "The M4F wastes cycles on the odd 3x3 kernel size due to its 2-wide SIMD, while the M55's 16-wide vector processes spatial dimensions efficiently. The actual speedup is ~5.1x, exceeding the theoretical 3.8x ratio.", "The M55 has a dedicated hardware accelerator for depthwise convolutions, resulting in a fixed 10x speedup across all kernel sizes."], "correct_index": 2}}, {"id": "tinyml-0417", "title": "Interrupt Overhead Impact on Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total interrupt overhead during one 18ms inference and determine if it affects the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0418", "title": "The Zero-Point Shift Wreck", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the garbage output?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 32-bit accumulator is overflowing before the second layer runs.", "The intermediate 32-bit accumulator values are being used by the second layer without being correctly rescaled and shifted to the second layer's zero-point and scale.", "The ReLU6 activation function is not correctly implemented for INT8 inputs.", "The weights for the second layer were quantized using per-tensor instead of per-channel quantization."], "correct_index": 1}}, {"id": "tinyml-0420", "title": "The Keyword Spotting Memory Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much Flash does replacing the 128-to-128 standard convolution with a depthwise separable convolution save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reduction is about 2x because it's a two-step process.", "It has no effect on Flash memory, only on activation size in SRAM.", "The reduction is approximately 8.4x, saving about 130 KB of Flash.", "The reduction is proportional to the kernel size, so it's a 9x reduction (3*3)."], "correct_index": 2}}, {"id": "tinyml-0421", "title": "The Inverted Residual Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an inverted residual block likely faster than a classic residual block on the Cortex-M7 MCU?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's faster because the expansion layer allows for more parallelism on the CPU.", "It's faster by reducing SRAM data traffic, as the large intermediate tensor from the expansion layer is kept in-register.", "It's not faster; the expansion layer increases FLOPs and will make the model slower.", "It's faster because it requires fewer multiply-accumulate operations overall."], "correct_index": 1}}, {"id": "tinyml-0423", "title": "The Mixed-Precision Memory Spike", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What transient memory spike causes the mixed-precision TFLM model to fail during interpreter initialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FP32 layer's weights are 4x larger, and the 256KB of SRAM is not enough to hold them during inference.", "The model's total activation memory now exceeds 256KB because one layer uses FP32 precision.", "The de-quantization step requires a temporary 64KB FP32 tensor to be created while the 16KB INT8 input tensor is still in memory, causing an 80KB transient spike.", "The FP32 operation causes memory fragmentation in the Tensor Arena, preventing a large enough contiguous block from being allocated."], "correct_index": 2}}, {"id": "tinyml-0424", "title": "The Silent Factory Floor", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given INT8's range of [-128, 127], what is the most likely cause of the model's failure in the noisy factory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has overfit to the clean lab data and cannot generalize to the noisy factory environment.", "The Cortex-M4's ~336 MFLOPS is insufficient to process the audio in real-time, causing missed events.", "The high-amplitude factory noise is causing activation values to exceed the INT8 maximum of +127, leading to saturation and information loss.", "The device's microphone is physically clipping the loud audio signal before it even reaches the model."], "correct_index": 2}}, {"id": "tinyml-0427", "title": "The Vision Transformer SRAM Overflow", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the proposed Vision Transformer architecture unsuitable for the 256 KB SRAM Cortex-M7 during runtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The attention mechanism's FLOPs are too high, making it miss the latency deadline.", "The model's weights are too large to fit in the 256KB of SRAM.", "The quadratic scaling of attention creates intermediate activation tensors that overflow the 256KB of SRAM.", "The patch embedding layer requires 270 KB for floating point operations, crashing the MCU."], "correct_index": 2}}, {"id": "tinyml-0433", "title": "The Sensor Fusion Skew", 
"topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What hardware-induced data issue most likely explains the 30% accuracy drop after deploying the quantized sensor fusion model?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The INT8 quantization process has likely removed crucial features from the model weights.", "The MCU memory bandwidth is insufficient to load data quickly enough, causing data corruption.", "Training-serving skew from the production sensors introducing unmodeled jitter and bias shifts the input distribution.", "The device's power management unit is throttling the clock speed, leading to calculation timeouts."], "correct_index": 2}}, {"id": "tinyml-0434", "title": "The Unstable Keyword Augmentation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does uncontrolled noise mixing ruin the Cortex-M4 keyword `start` model clean accuracy and false-positive rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MFCC feature extraction is failing to process the complex augmented audio, creating garbage input vectors.", "The model is too small; a larger model is needed to learn from a more diverse and noisy dataset.", "The Cortex-M4's limited precision (no FPU) is causing numerical underflow when processing the low-energy noise signals.", "The data augmentation created unrealistic samples with uncontrolled Signal-to-Noise Ratios (SNR), corrupting the training data quality."], "correct_index": 3}}, {"id": "tinyml-0435", "title": "The Silent Misfire", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does quiet-office PTQ calibration make a Cortex-M4 KWS model misfire in busy-street noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weight overflow occurred during the conversion process.", "The Cortex-M4 CPU has bugs in its INT8 processing instructions.", "The model's activations are clipping because the calibration data did not capture the full dynamic range of real-world inputs.", "The model requires FP32 precision and is too complex for INT8 quantization to ever work."], "correct_index": 2}}, {"id": "tinyml-0436", "title": "The Mixed-Precision Memory Budget", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "To meet the 454KB Flash budget while maximizing accuracy, which mixed-precision quantization strategy should you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": 
["Keep the CNN layers as FP32 and quantize the RNN layers to INT8.", "The model is too large and must be pruned or redesigned.", "Quantize the CNN layers to INT8 and convert the RNN layers to FP16.", "Quantize all layers to FP16."], "correct_index": 2}}, {"id": "tinyml-0437", "title": "The Micro-Convolution Budget", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What MAC reduction factor results from replacing the standard 3x3 64-to-128 convolution with a depthwise separable one?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 2}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It provides no computational savings, only parameter savings.", "Roughly a 2x reduction in MACs.", "Roughly an 8-9x reduction in MACs.", "Roughly a 64x reduction, proportional to the number of input channels."], "correct_index": 2}}, {"id": "tinyml-0438", "title": "The Transformer's Memory Spike", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this memory spike in the Transformer architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The sequence length N=1600 requires linear storage of ~6.4 KB, which fragments the heap.", "The intermediate N×N attention matrix requires ~10.24 MB of memory, scaling quadratically with sequence length.", "The 4x4 patch embedding creates a 160x160x16 tensor that consumes ~1.6 MB of SRAM.", "The parameter count dictates a 1.02 MB footprint, causing an off-by-one out-of-memory error."], "correct_index": 1}}, {"id": "tinyml-0439", "title": "The Power-Aware Architect", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the flaw in this approach and propose a more physically accurate reward function?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reward is fine; FLOPs are the main driver of power on microcontrollers.", "The reward should be `Accuracy / Parameters` to minimize flash size.", "The reward should model both compute and memory access energy costs, as data movement is a major power drain.", "The reward should be `Accuracy / Latency`, as faster models use less power."], "correct_index": 2}}, {"id": "tinyml-0442", "title": "The Privacy-Preserving Doorbell Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which training strategy has lower fleet data-transfer cost, centralized raw-audio upload or federated model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["On-device training for 
federated learning will drain the battery too quickly, making it infeasible compared to a simple upload.", "Centralized training is cheaper because cloud GPUs are more energy-efficient than on-device CPUs, leading to lower total energy consumption.", "The cloud cost is the dominant factor; centralized training is ~3.2x more expensive due to transferring 233 TB of raw data vs. 73 TB of model updates annually.", "Centralized training is 3.2x cheaper because you only send 233 GB of data per year compared to 730 GB for federated updates."], "correct_index": 2}}, {"id": "tinyml-0444", "title": "The Saturation Misfire", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the INT8 KWS model misfire on door slams when first-layer activations clip to 127?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4's computational power is insufficient, causing skipped samples during high-energy events.", "INT8 precision is inherently too low for audio tasks; the model must be deployed in FP16 or FP32.", "The quantization calibration range is too narrow due to an unrepresentative dataset, causing activation saturation.", "The model is overfitting to the training data and requires more dropout or regularization."], "correct_index": 2}}, {"id": "tinyml-0445", "title": "The Tensor Arena Hard Fault", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you avoid the 1.12 MB FP32 temporary from a 280 KB INT8 tensor on a limited SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive weight pruning to the largest layers to reduce the overall model size.", "Quantize the final layer to INT8, likely using QAT to preserve its accuracy.", "Re-architect the model to use smaller layers at the end of the network.", "Request a hardware change to a microcontroller with at least 1.5MB of SRAM."], "correct_index": 1}}, {"id": "tinyml-0446", "title": "The SRAM Budget Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does replacing the layer with a depthwise separable convolution solve peak activation SRAM usage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameter count is reduced by ~8.4x, so both memory and latency will decrease by ~8.4x, solving the problem.", "The layer is memory-bound; since parameters are stored in Flash, not SRAM, the change has no effect on the memory issue.", "The change reduces parameters and FLOPs by ~8.4x, but peak activation memory is unchanged at ~432 KiB; nevertheless, this already fits within the 2 MB SRAM.", "The FLOPs are reduced from ~339 MFLOPS to ~40 MFLOPS, but this increases latency because more, smaller operations are less efficient."], "correct_index": 2}}, {"id": "tinyml-0451", "title": "The Secure Doorbell A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", 
"level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What A/B testing strategy fits both the 200 KB and 450 KB models while preserving privacy and rollback safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stream images to the cloud to run the new model server-side, allowing for rapid iteration without touching the device.", "Use a dual-partition OTA scheme, writing the new model to an inactive partition and swapping on boot for maximum safety.", "Overwrite the old model with the new one on 50% of devices; the risk of bricking is acceptable for a test rollout.", "Store both models concurrently in the available flash and use a runtime flag to switch between them, as there is sufficient space (278KB left for app logic)."], "correct_index": 3}}, {"id": "tinyml-0452", "title": "The ADC Overflow Anomaly", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do raw 12-bit I2S microphone values collapse an INT8 model calibrated on normalized [-1, 1] audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex and is running out of SRAM on the device, causing memory corruption.", "The Cortex-M4 CPU doesn't have the necessary SIMD instructions to correctly handle INT8 math, leading to calculation errors.", "The quantization scale is mismatched with the raw ADC data range, causing all inputs to clip to the INT8 max value.", "The I2S microphone's clock speed is out of sync with the MCU's, causing dropped bits and corrupted input frames."], "correct_index": 2}}, {"id": "tinyml-0453", "title": "The Cafeteria False Wake", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the mechanical reason the INT8 model fails so spectacularly in a noisy environment when the identical FP32 architecture was robust?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cafeteria noise is causing power brownouts on the device, leading to random bit-flips in the model's weights.", "The calibration dataset lacked noisy examples, causing quantization to clip noisy activations and make them indistinguishable from the keyword.", "The FP32 model was likely overfitting to the clean dataset, and the INT8 model is simply exposing this pre-existing weakness.", "The INT8 model requires more SRAM than is available, and the stack is colliding with the heap, corrupting the activation tensors during inference."], "correct_index": 1}}, {"id": "tinyml-0454", "title": "The Depthwise Separable Switcheroo", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the parameter reduction to determine if 
replacing the 3x3 convolution with a depthwise separable convolution is a valid optimization.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a bad trade-off; the parameter savings are minimal (~10-20%).", "It reduces parameters by about 50%, which is a good starting point.", "It provides a massive parameter reduction of ~85%, making it an excellent optimization strategy.", "It will not change the parameter count, it only reduces the required computation (FLOPs)."], "correct_index": 2}}, {"id": "tinyml-0455", "title": "The Vision Transformer Memory Trap", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which person-detection model fits a 512 KB Cortex-M7: MobileNetV2 activations or a ViT attention matrix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT, because it has fewer parameters and is more modern.", "The CNN, because the ViT's quadratic memory scaling in its MLP or attention block exceeds the MCU's SRAM.", "Both will fit easily, as 512KB is plenty of memory for either model architecture.", "Neither will fit; both CNNs and ViTs require multiple megabytes of SRAM for vision tasks."], "correct_index": 1}}, {"id": "tinyml-0456", "title": "The Neural Architecture Search Power Puzzle", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate meets the 2.5 mW average power budget once duty cycle is considered?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 1}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model C, because it has the lowest active power consumption (20mW).", "Model A, because its short inference time results in the lowest average power (2.0mW).", "Model B, as it offers the best balance between active power and inference time.", "None of the models meet the budget, as their active power all exceeds 2.5mW."], "correct_index": 1}}, {"id": "tinyml-0460", "title": "The Silent Saturation", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud keyword samples collapse an INT8 Cortex-M4 model from 95% to 60% accuracy after PTQ?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's tensor arena is overflowing the SRAM due to larger intermediate buffers required for INT8.", "Critical weight precision was lost during quantization, corrupting the model's core feature extractors.", "The calibration dataset lacks sufficient dynamic range, causing activation values to overflow the INT8 range on loud inputs.", "The Cortex-M4 lacks a floating-point unit, causing emulation errors when de-quantizing intermediate results."], "correct_index": 2}}, {"id": "tinyml-0461", "title": "The 
Mixed-Precision Power Budget", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you solve this trade-off using a mixed-precision approach given the constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use the fully INT8 model but perform data augmentation to make it more robust to high-g events.", "Keep the first two layers in FP32 and quantize the remaining 90% of the model to INT8.", "Underclock the Cortex-M7 when running the full FP32 model to fit the power budget.", "Implement the full model in INT16 to get a balance of precision and performance."], "correct_index": 1}}, {"id": "tinyml-0463", "title": "The Vision Transformer Memory Explosion", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 96x96 ViT with 2x2 patches overflow 1 MB SRAM on the first attention block?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT model has too many parameters to fit in 1MB of Flash.", "The self-attention mechanism requires storing a quadratically-scaling attention matrix in SRAM, which overflows the memory.", "The Cortex-M7 CPU does not have hardware acceleration for the softmax operation in the attention block.", "The parameter count exceeds SRAM capacity (21.2MB)."], "correct_index": 1}}, {"id": "tinyml-0464", "title": "The Power-Aware NAS Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate should be selected under the 1 mW average power constraint, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model A is the correct choice; its average power is ~0.51 mW (under budget), so we can prioritize its higher accuracy.", "Model B is the correct choice; its average power is ~0.21 mW, and you must always select the lowest power model regardless of the budget.", "Neither model works; Model A draws ~1.01 mW and Model B draws ~1.005 mW, both over the 1 mW budget.", "Both models work, but their power consumption is dominated by the 10 uW sleep state."], "correct_index": 0}}, {"id": "tinyml-0466", "title": "The Privacy-Power Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is transmitting 64 KB raw audio or spending 25 mJ to send a 2 KB summary more energy-efficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud Upload is more efficient; the on-device compute adds a 25mJ overhead that should be avoided.", "On-Device is 32x more efficient, since the data payload is reduced from 64KB to 2KB.", "On-Device is ~9.1x more efficient, since the total energy drops from 320mJ to 35mJ.", "Both options are roughly 
equivalent in energy cost once you factor in both compute and networking."], "correct_index": 2}}, {"id": "tinyml-0467", "title": "The Federated Thermostat A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the monthly net economic impact of federated learning for 1M thermostats after user savings, support savings, and server cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A net benefit of ~$180,000/month, reflecting user savings minus server costs.", "A net loss, because the $20,000/month server cost is a significant new expense.", "A net benefit of ~$230,000/month, from both user savings and reduced support tickets.", "A net benefit of ~$30,000/month, reflecting only the support ticket savings minus server costs."], "correct_index": 2}}, {"id": "tinyml-0468", "title": "The Noisy Kitchen Problem", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What most likely causes the INT8 keyword-spotting model to fail in the noisy kitchen despite strong lab accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is overfit to the clean training data and cannot generalize to noisy environments.", "The Cortex-M4's computational power is insufficient for running the CNN on noisy audio, causing missed inferences.", "The model's INT8 quantization range, calibrated on clean audio, is too narrow for the noisy kitchen environment, causing input values to saturate.", "The increased noise in the audio signal causes larger activation tensors, leading to an SRAM memory overflow in the tensor arena."], "correct_index": 2}}, {"id": "tinyml-0470", "title": "The Battery-Powered A/B Test", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which saves more battery for 1M smart doorbells: a 500 KB OTA model download over WiFi, or a 20 KB federated update followed by 15 s of on-device training at 100% CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["OTA Update; the 15 seconds of 100% CPU utilization for on-device training is the biggest power drain.", "Federated Fine-Tuning; it is more energy-efficient because the energy saved by the shorter radio time for the small upload far outweighs the energy spent on local computation.", "They are roughly equivalent; the total active time for both tasks is similar (~16 seconds), so the battery impact will be negligible.", "OTA Update; transferring 500 KB of verified firmware from the cloud is inherently more secure and reliable than running on-device training, justifying the energy cost."], "correct_index": 1}}, {"id": "tinyml-0471", "title": "The Real-Time Interrupt Stall", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling fault causes the vibration analysis 
to miss its 1 ms deadline despite sufficient MCU compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FFT model (0.3 MFLOPs) is too computationally expensive for the Cortex-M4 MCU to meet the 1ms deadline.", "The 2MB of available SRAM is insufficient to hold both the FFT model's tensors and the temperature task's state.", "The low-priority temperature check is preempting the high-priority vibration analysis, causing it to miss its deadline due to the combined execution time.", "The 168 MHz clock speed of the MCU is the bottleneck; upgrading to a 480 MHz Cortex-M7 would solve the problem."], "correct_index": 2}}, {"id": "tinyml-0473", "title": "The Silent Factory Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a Cortex-M4 INT8 KWS model clamp factory-floor activations to 127 after clean-speech PTQ calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has a memory leak in the convolution operator, causing a buffer overflow.", "The Cortex-M4 CPU does not have enough computing power (MFLOPS) to execute the model in real-time.", "The calibration dataset used for PTQ did not represent the dynamic range of real-world audio inputs, causing activation overflow.", "The model should have been quantized to FP16 instead of INT8, as FP16 has higher precision."], "correct_index": 2}}, {"id": "tinyml-0474", "title": "The Doorbell Latency Crisis", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which architectural change most effectively solves this compute bottleneck based on quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Prune 50% of the filters in the standard convolution.", "Apply INT8 post-training quantization.", "Replace the standard convolution with a depthwise separable convolution.", "Replace the CNN layer with a small MobileViT-style attention block."], "correct_index": 2}}, {"id": "tinyml-0475", "title": "The Overwhelmed Sensor Fusion MCU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 11 ms of CPU work per 10 ms sensor window make the Cortex-M7 miss a 2 ms shutdown deadline?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 0}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M7's instruction cache is thrashing due to context switching between the audio and accelerometer tasks.", "The model is too large, causing an SRAM overflow which leads to system resets and missed data.", "The total required processing time (11ms) exceeds the data arrival interval (10ms), creating an unstable queue and growing latency.", "The SPI bus connecting the sensors to the MCU is saturated and cannot deliver 
the data fast enough."], "correct_index": 2}}, {"id": "tinyml-0476", "title": "The Desert Drone Reboot", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do Cortex-M7 delivery drones reboot when desert landing vision pushes a 150 ms pipeline past a 200 ms watchdog?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The desert heat is causing the MCU to thermally throttle, slowing down execution until it misses the watchdog deadline.", "The vision pipeline's latency has increased due to data drift from the new desert environment, exceeding the 200ms watchdog budget.", "A memory leak in the preprocessing code is causing `malloc` to occasionally take longer than 200ms, triggering the watchdog.", "The drone's power management system is unstable during landing, causing a voltage drop that resets the MCU."], "correct_index": 1}}, {"id": "tinyml-0477", "title": "The Silent Saturator", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What quantization failure mode explains loud-field input failures after calibration on quiet office audio?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 1}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4's DSP instructions have a bug with INT8 convolutions that manifests under high-magnitude inputs.", "General precision loss from the FP32-to-INT8 conversion is too severe, making the model inherently unstable.", "The calibration dataset was not representative of real-world inputs, causing activation overflow for loud signals.", "Weight overflow occurred during the initial quantization of the model's parameters, corrupting a key layer."], "correct_index": 2}}, {"id": "tinyml-0479", "title": "The PCIe-Powered Sensor", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which statement best identifies the primary bottleneck and core flaw of adding a PCIe accelerator to a TinyML sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Transaction Layer Packet (TLP) overhead would add unacceptable latency compared to the direct memory access nature of the microcontroller's SPI bus.", "The microcontroller's GPIO pins cannot be physically configured to support the SerDes lanes required for the PCIe protocol, making it a driver and hardware compatibility issue.", "The power consumption of the PCIe interface is orders of magnitude too high for the device's battery-powered budget, and its bandwidth is unnecessary for a single audio stream.", "The main problem is that NVLink would be a better choice than PCIe for connecting the accelerator, as its higher bandwidth and GPU-centric design are better suited for ML workloads."], "correct_index": 2}}, {"id": "tinyml-0480", "title": "The Real-Time 
Queueing Cascade", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which event misses the 35 ms deadline when 20 ms vibration inferences arrive at 0, 10, and 20 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["All events will be processed on time, since the 20ms processing time is well under the 35ms deadline.", "The system will crash due to an SRAM overflow from having to buffer three events simultaneously.", "The third event misses its deadline because the queueing delay from the first two events pushes its completion time past its deadline.", "A real-time operating system (RTOS) would use preemption to pause the earlier events, ensuring all three deadlines are met."], "correct_index": 2}}, {"id": "tinyml-0481", "title": "The Silent Mic Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud clips collapse an INT8 KWS model to under 10% accuracy after quiet-audio calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The activation memory required by the loud inputs is overflowing the device's 256KB SRAM, causing data corruption.", "The Cortex-M4's instruction set is failing to correctly execute the quantized operations, leading to arithmetic errors on loud inputs.", "The calibration dataset was not representative of real-world audio levels, causing activation values to clip at the INT8 maximum of 127.", "The model architecture is numerically unstable, and the smaller bit-width of INT8 magnifies pre-existing training issues."], "correct_index": 2}}, {"id": "tinyml-0482", "title": "The TinyML Transformer Trap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 5x smaller Tiny-ViT OOM on a 256 KB Cortex-M4 when the 150 KB CNN fits perfectly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4 is too weak. Propose upgrading the hardware to a mobile-class chip that has enough RAM to support a Transformer architecture.", "The model conversion process is buggy. With 5x fewer parameters, the Transformer should be much smaller. Focus on debugging the TFLite conversion script.", "The model needs more aggressive quantization. Apply post-training INT4 or INT2 quantization to the Transformer to reduce its memory footprint until it fits.", "The Transformer O(N^2) self-attention matrix is too large for the SRAM. 
Replace the CNN standard convolutions with depthwise separable convolutions to improve efficiency and accuracy within the memory budget."], "correct_index": 3}}, {"id": "tinyml-0486", "title": "The TinyML Latency Crisis", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is replacing the bottleneck convolution with a depthwise separable convolution sufficient to meet the 100 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the bottleneck on a Cortex-M4 is always memory bandwidth (low Ops/Byte ratio), so reducing FLOPs will not significantly impact latency.", "No, the reduction is only about 2x, bringing latency down to ~75ms, but adding the pointwise layer overhead will push it back over 100ms.", "Yes, the change reduces computation by ~8.4x, bringing the new total latency to ~41ms, which is within the 100ms budget.", "Yes, it will meet the deadline because parameter count is reduced by ~8.4x, which translates directly to a latency reduction of the same factor."], "correct_index": 2}}, {"id": "tinyml-0500", "title": "The Silent Activation Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this catastrophic accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The weight quantization process introduced too much error, shifting the model's decision boundary.", "The microcontroller's specific instruction set for INT8 convolutions has a bug, producing incorrect results.", "The calibration dataset was not representative of real-world audio, causing activation values to overflow the narrow INT8 dynamic range.", "The device has insufficient SRAM, causing the tensor arena memory to be corrupted during inference."], "correct_index": 2}}, {"id": "tinyml-0503", "title": "The Predictive Maintenance TCO Dilemma: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Should Model B be deployed after accounting for failure savings and the added sensor energy cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Do not deploy. The new model adds over $300,000 in annual energy costs, making it too expensive.", "Deploy. The TCO decreases by approximately $200,000 annually due to the reduction in false positives.", "Deploy. The TCO decreases by approximately $4.2 million annually, as failure cost savings vastly outweigh the minimal increase in power cost.", "Do not deploy. 
The marginal improvement in accuracy does not justify the complexity of a federated learning deployment."], "correct_index": 2}}, {"id": "tinyml-0505", "title": "The Smart Doorbell's Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural replacement best addresses both latency and memory problems in the smart doorbell convolution layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply INT8 post-training quantization to the layer. It will cut the memory footprint by 4x and is simple to implement.", "Replace the layer with a small Vision Transformer block, as they are state-of-the-art for vision tasks.", "Replace the standard convolution with a 3x3 depthwise separable convolution to reduce computation by ~8.4x.", "Use unstructured weight pruning to remove 85% of the connections, as this will create a sparse and efficient layer."], "correct_index": 2}}, {"id": "tinyml-0508", "title": "The Keyword Spotting Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which path should you choose for the Cortex-M4 keyword spotting model, micro-ViT or NAS with depthwise separable CNNs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has too many parameters. Using depthwise separable convolutions reduces the parameter count, which will solve our problem by making the model smaller for the flash.", "The model is too slow. The ViT architecture is more modern and can be heavily quantized to INT4 to fit within the memory and latency budget.", "The bottleneck is peak SRAM usage (Tensor Arena). The ViT is infeasible due to its quadratic attention complexity. We must switch to depthwise separable convolutions to reduce computation by ~8x and the resulting activation memory footprint.", "The SRAM usage is too high. We should implement a Mixture-of-Experts (MoE) layer to ensure only a fraction of the model is executed per inference, which is the standard way to scale down large models."], "correct_index": 2}}, {"id": "tinyml-0511", "title": "The TinyML Keyword Spotting Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which candidate architecture should you choose under the 256 KB tensor arena and 100 ms latency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0515", "title": "The Flash Memory Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is replacing the standard convolution with a depthwise separable convolution better than slashing output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The junior engineer is right. 
Slashing output channels from 128 to 16 is the simplest and most direct way to achieve the required ~8x size reduction.", "Use a smaller kernel, like 1x1, instead of 3x3. This reduces parameters without changing the channel depth.", "Replace the standard convolution with a depthwise separable convolution. It achieves a similar parameter reduction while preserving the 128-channel feature depth, thus retaining higher model accuracy.", "Apply 4-bit integer quantization to the existing layer. This will reduce the model size by 4x, which is the most significant saving possible."], "correct_index": 2}}, {"id": "tinyml-0525", "title": "The Federated Wake-Word TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which yearly energy TCO is lower for improving the wake-word model, centralized audio upload or federated on-device training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The federated approach has a higher TCO because the 10-second training time is 10x longer than the 1-second upload time, leading to higher energy consumption.", "The centralized approach is overwhelmingly more expensive due to the massive cloud ingress and storage costs from 1 million devices, making the on-device energy cost irrelevant.", "The centralized approach has a higher energy TCO because the Wi-Fi radio's power draw (1W) is 20x higher than the MCU's (50mW), making its total energy per event 2x greater despite the shorter duration.", "The energy TCO difference is negligible as it amounts to fractions of a cent per device, so the decision should be based purely on privacy and implementation complexity, not economics."], "correct_index": 2}}, {"id": "tinyml-0526", "title": "The Silent Wake-Word Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the INT8 `Hey Lumi` model fail in car noise after PTQ calibration on quiet audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The calibration dataset did not include noisy, in-car audio, causing activation values to exceed the calibrated range and 'clip' during quantization.", "The Cortex-M4 microcontroller lacks the necessary DSP instructions to perform INT8 convolutions efficiently, corrupting the output.", "The model architecture uses a SiLU (Swish) activation function, which is non-saturating and is known to cause numerical instability during INT8 conversion.", "The weight values were corrupted during the FP32-to-INT8 conversion. 
The model must be retrained from scratch using Quantization-Aware Training (QAT)."], "correct_index": 0}}, {"id": "tinyml-0530", "title": "The Silent Overflow Catastrophe", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does clean-audio PTQ calibration make noisy +8.0 activations collapse a Cortex-M4 KWS model to random accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's architecture uses activation functions that are fundamentally incompatible with 8-bit integers.", "Weight quantization error was too high, destroying the model's learned representations across all layers.", "The calibration dataset was not representative, causing activation value clipping during inference on real-world data.", "The Cortex-M4's DSP instructions have a bug in the INT8 convolution kernel, leading to incorrect matrix multiplications."], "correct_index": 2}}, {"id": "tinyml-0533", "title": "The Factory Floor Cascade Failure", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 80 MFLOP vibration model viable under the 50 ms hard real-time deadline on the Cortex-M7?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The event queue size is too small to handle the burst. Increasing the queue size would solve the problem.", "The Cortex-M7 is thermally throttling under the burst load, reducing its effective FLOPS.", "The per-event processing time (83.3ms) exceeds the real-time deadline (50ms), making the system unstable even for a single event.", "The average arrival rate (1 Hz) is much lower than the service rate (~12 Hz), so the issue is a software bug in the event handler, not a performance bottleneck."], "correct_index": 2}}, {"id": "tinyml-0534", "title": "The Silent Doorbell Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud close-range doorbell activations clamp to -128 in a 256 KB Flash INT8 keyword model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has overfit to the clean lab audio and cannot generalize to noisy field conditions.", "The microphone's analog-to-digital converter is clipping the raw audio waveform before it reaches the MCU.", "The calibration dataset for INT8 quantization was not representative, causing activation values from loud sounds to overflow.", "The Tensor Arena in SRAM is too small, and the larger activations from loud sounds are causing memory corruption."], "correct_index": 2}}, {"id": "tinyml-0535", "title": "The Keyword Spotting Memory Blowout", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What FP32 parameter memory savings result from replacing the standard 3x3 convolution with a depthwise separable convolution?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The standard convolution requires ~72 KB, while the depthwise separable version requires only ~1.1 KB, an almost 66x reduction.", "The device is compute-bound, not memory-bound. A Cortex-M4 cannot execute this many FLOPs, so changing convolution type is irrelevant.", "The standard convolution requires ~72 KB for parameters, while the depthwise separable version requires ~9.1 KB, an almost 8x reduction.", "The standard convolution requires ~18 KB. The depthwise separable version requires ~2.3 KB. The savings are not significant enough."], "correct_index": 2}}, {"id": "tinyml-0537", "title": "The Smart Doorbell's Update Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which monthly update plan has lower TCO for 1M doorbells: raw-audio OTA or federated gradients?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 2}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized OTA, because the on-device compute cost of FL would be too high and drain the battery.", "Centralized OTA, because it's more secure; a signed firmware image prevents tampering, unlike FL which is vulnerable to data poisoning.", "Federated Learning, because it has a significantly lower data transfer TCO and preserves user privacy by not uploading raw audio.", "The annual costs for both are under $500, so they are negligible. 
Choose Centralized OTA for its simpler implementation."], "correct_index": 2}}, {"id": "tinyml-0539", "title": "The Saturated Microphone", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you fix INT8 activation saturation when noisy audio drives first-layer values from [-10,10] to [-90,90]?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Add a clipping function before the first layer to cap activations at 10.0.", "Recompile the model using FP16 precision for the first layer (mixed precision).", "Re-run the quantization calibration with a dataset that includes noisy audio samples.", "Increase the number of channels in the first convolutional layer to better capture features."], "correct_index": 2}}, {"id": "tinyml-0540", "title": "The Transformer's Memory Trap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the Transformer architecture riskier than the CNN under the Cortex-M4's 256 KB SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Transformer's MAC count is too high for the Cortex-M4's clock speed, causing a latency violation.", "Transformer runtimes have a large fixed memory overhead, which when added to the 150KB of parameters, exceeds the 256KB budget.", "The activation memory scales quadratically with sequence length due to the self-attention matrix, creating a single large tensor that consumes the SRAM budget.", "The attention mechanism requires FP32 weights to function correctly, making the true parameter size 600KB (150K * 4)."], "correct_index": 2}}, {"id": "tinyml-0545", "title": "The Dual-Keyword Memory Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural change should you make when doubling KWS output channels threatens the Cortex-M4 SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The activation memory is the bottleneck. The input audio features must be downsampled to a lower resolution.", "The Cortex-M4 is not powerful enough. 
The device requires a hardware upgrade to a Cortex-M7 with more SRAM.", "Replace the standard convolution with a depthwise separable convolution to increase capacity while reducing parameter count.", "Keep the larger standard convolution but apply 80% unstructured weight pruning to fit it into memory."], "correct_index": 2}}, {"id": "tinyml-0555", "title": "The Clock Tree Surprise", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How is running slower using *more* energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 16 MHz clock reduces dynamic power proportionally to 10 mW, yielding a total energy of 1.0 mJ per inference, saving battery life.", "Running at 16 MHz uses 1.5 mJ per inference because dynamic power scales linearly with frequency but active time scales inversely, while accumulating 5x more leakage energy.", "Static (leakage) power is voltage-dependent. At 16 MHz the MCU stays active 5x longer (100 ms vs 20 ms), accumulating 1.51 mJ total energy compared to 1.01 mJ for race-to-sleep.", "The 16 MHz clock reduces the active power to 15 mW, saving 35 mW, and yielding a lower total energy of 0.3 mJ per inference."], "correct_index": 2}}, {"id": "tinyml-0556", "title": "The Branch Prediction Penalty on MCU", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a branch-based INT8 ReLU consume ~38% of Cortex-M4 inference time for 500,000 activations?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 1}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4 has a branch predictor that mispredicts ReLU branches 50% of the time, causing 10-cycle pipeline flushes on each miss.", "The Cortex-M4 has no dynamic branch predictor. With ~50% negative activations, every taken branch flushes the 3-stage pipeline (1-3 penalty cycles per element). For 500K activations, this wastes ~750K cycles. 
Fix: use a branchless USAT instruction or CMSIS-NN's vectorized SIMD ReLU for up to 10x speedup.", "The compiler is generating floating-point comparison instructions for the INT8 values, triggering a software emulation trap on each comparison since the M4 has no FPU.", "The ReLU function is memory-bound because each activation requires a cache miss to load from SRAM, and the Cortex-M4's single-cycle SRAM interface cannot keep up."], "correct_index": 1}}, {"id": "tinyml-0561", "title": "The DMA Pipeline for Sensor Data", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does single-buffer SPI DMA at 1.6 kHz drop samples during 30-50 ms Cortex-M4 inference, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0563", "title": "The Lookup Table Optimization", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the memory cost, the speedup, and when this optimization breaks down?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0564", "title": "The RP2040 Dual-Core ML", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can two 133 MHz RP2040 cores halve an 80 ms audio classifier, or does SRAM bus contention limit speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0565", "title": "The nRF5340 Network Core Split", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you partition memory and processing, and what happens if the BLE stack on the network core needs to interrupt the application core mid-inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0566", "title": "The Mel Spectrogram Compute Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Does 40-bin Mel feature extraction on a 168 MHz Cortex-M4 fit beside a 15 ms model in a 100 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0573", "title": "On-Device Data Collection for Retraining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much triggered anomaly data can 2,000 Cortex-M4 vibration sensors store in 114 KB flash and upload over BLE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": "", "options": ["Use raw continuous logging: stream 6.4 KB/s of sensor data to flash, filling 114 KB in 17.8 seconds, then upload via BLE. This gives the ML team the full fidelity data they need for retraining.", "Use triggered + compressed logging: record only anomaly events (800 bytes/event, ~10/day = 8 KB/day). Flash holds 14 days of data. BLE upload at 60 KB/s takes 1.9s per device. For 2,000 devices through 10 gateways: 6.3 minutes total. This yields 284,000 labeled edge cases (227 MB) despite each device having only 114 KB of logging space.", "Upgrade to a microcontroller with more flash (16 MB) to store weeks of raw sensor data before uploading.", "Use federated learning to retrain the model on-device, eliminating the need to collect and upload any raw sensor data to the cloud."], "correct_index": 1}}, {"id": "tinyml-0578", "title": "The I2C Clock Stretching Deadlock", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a crashed I2C sensor hold SCL low forever and freeze an MCU in a wait loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0584", "title": "Cortex-M55 + Ethos-U55 + Cortex-A32 — Which Core Runs What?", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which Alif Ensemble E7 compute element should run detection, tracking, and H.264 streaming, and why?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 0}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all three tasks on the Cortex-A32 since it has the highest clock speed, Linux support, and can multitask with threads. The NPU and M55 add unnecessary complexity.", "Assign person detection to Ethos-U55 (128 MACs/cycle, 20.9ms, 20 mW — 357x more energy-efficient than A32 for this workload). Assign H.264 encoding to Cortex-A32 (needs MMU + caches for codec complexity, 30ms at 200 mW). Assign object tracking to Cortex-M55 (lightweight Kalman filter, 1ms at 15 mW). This heterogeneous split saves ~2.24W vs running everything on A32.", "Assign all ML tasks (detection + tracking) to the Ethos-U55 NPU and use the A32 only for H.264 encoding. 
The M55 should remain idle to save power.", "Assign person detection to Cortex-M55 with Helium (it has SIMD for CNNs), H.264 to Cortex-A32, and use the Ethos-U55 NPU for tracking since it can maintain state across frames."], "correct_index": 1}}, {"id": "tinyml-0585", "title": "Updating a 500 KB Model Over BLE 5.0", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does the model's quantization format (e.g., FP32 vs INT8) affect the OTA size, and why does this become a deployment bandwidth trade-off?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantization format doesn't affect OTA size because the model is compressed during transmission regardless of the weight precision.", "INT8 quantization reduces OTA model size by 4x (400 KB to 100 KB), cutting BLE transfer time from 8s to 2s and energy per update from 211 mJ to 52.8 mJ.", "The OTA size reduction from quantization is irrelevant because BLE 5.0's built-in compression handles the FP32 model efficiently.", "INT8 reduces OTA size by 2x (not 4x) because the TFLite flatbuffer format adds metadata overhead that scales with the number of parameters, partially negating the precision reduction."], "correct_index": 1}}, {"id": "tinyml-0587", "title": "Duty Cycle for Energy Harvesting Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the maximum inference rate (inferences per minute) the energy budget supports?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 1}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Maximum rate: 200 uW / 30 mW = 0.67% duty cycle. At 12ms per inference: 0.0067 x 60,000ms/min / 12ms = 33 inferences/min.", "Energy per cycle: inference (360 uJ) + sensor (10 uJ) + BLE TX (800 uJ) = 1,170 uJ. Effective harvest: 200 uW x 70% = 140 uW. Period: 1,170/140 = 8.36s. Rate: ~7/min. BLE dominates (68%). Optimization: batch 10 results per BLE packet, amortizing BLE to 80 uJ/inference. New rate: ~18/min (2.5x improvement).", "The vibration harvester cannot sustain any inferences because 200 uW is insufficient to power the Cortex-M4F's minimum active current of 30 mW.", "Maximum rate: 60 inferences/min. 
The harvester generates 200 uW continuously, and inference costs only 360 uJ, so the energy budget is limited solely by the 12ms inference time."], "correct_index": 1}}, {"id": "tinyml-0588", "title": "The Floating Point Sensor Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does float scaling an integer sensor stream cost more than the first three INT8 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0591", "title": "The Factory Floor Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization failure explains the factory-floor accuracy collapse, and what mixed-precision fix fits the SRAM budget?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 2}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0592", "title": "The Ghost in the Microcontroller", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the intermittent close-range radar detection failure in the INT8 automotive model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0594", "title": "The Saturation Catastrophe", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What calibration problem causes the INT8 keyword-spotting model to collapse in noisy factory audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0596", "title": "The Silent Drift Catastrophe", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What cascading failure is caused by checkpointing adaptive normalization statistics to Flash every 100 inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0597", "title": "The Instruction Cache Thrashing Loop", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where do the missing 3.1 ms go when a Cortex-M7 depthwise convolution takes 4.2 ms instead of 1.1 ms?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 3}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-0601", "title": "The Watchdog Interrupt Starvation", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does moving an 800ms Cortex-M4 anomaly model into a timer ISR make a 500ms watchdog reset the device?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 3}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0602", "title": "The SRAM Bank Collision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does camera DMA into SRAM1 slow Cortex-M7 inference by 25% even though DMA uses zero CPU cycles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0603", "title": "The MCU Throughput Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 sustain 10 inferences per second for a 10M-MAC model while serving sensors and UART?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 4}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0604", "title": "The Operator Support Gap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 on a Cortex-M4 replace Resize Bilinear, Pad, and Swish to keep CMSIS-NN coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0605", "title": "The MAX78000 CNN Accelerator", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you explain architecturally why the accelerator achieves 30-100x better energy efficiency, and identify what workloads it cannot accelerate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0607", "title": "The Energy Harvesting Inference Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many inferences per hour can you sustain, and what happens during a cloudy day when light drops to 50 lux?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 2}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": "", "options": ["At 500 lux: 27 inferences/hour. At 50 lux: 0 inferences/hour — the 1 uW harvest exactly equals sleep power. Breakeven for 1 inference/hr: ~67 lux.", "The solar cell generates enough energy at 50 lux for 5 inferences/hour.", "At 500 lux: 30 inferences/hour (36 mJ / 1.2 mJ).", "The system can sustain 27 inferences/hour regardless of light level."], "correct_index": 0}}, {"id": "tinyml-0610", "title": "Fusing Accelerometer + Microphone + Temperature on One MCU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the memory layout, DMA strategy, and scheduling for three sensor branches in 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run three separate models sequentially, each with its own 120 KB tensor arena and 80 KB weights in SRAM, totaling 360 KB SRAM for arenas alone.", "Use a shared-backbone multi-head architecture: shared backbone (100 KB flash) + 3 small heads (30 KB total flash). Single tensor arena: 180 KB SRAM. DMA strategy: mic uses double-buffer (16 kHz continuous), accelerometer uses single-buffer (1 kHz bursts), temp uses polled I2C (1 Hz). Total SRAM: 240 KB / 256 KB. This avoids the 360 KB SRAM requirement of three separate models.", "Use external PSRAM to hold all three tensor arenas, accessing them via SPI. The 2-3x latency penalty is acceptable since the temperature model only runs once per second.", "Time-multiplex a single model across all three sensor streams by reloading different weights from flash for each sensor, running the mic model at 62.5 Hz, accelerometer at 1 Hz, and temperature at 0.017 Hz."], "correct_index": 1}}, {"id": "tinyml-0611", "title": "Co-Designing a TinyML Accelerator", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What sub-0.5 mm² INT8 accelerator blocks would beat CMSIS-NN on a Cortex-M4 by 10x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Build a mini-GPU with 16 streaming multiprocessors (0.4 mm²) to handle all floating-point and integer ops.", "Build a weight-stationary systolic array with 64 INT8 MACs, 32KB weight SRAM, and 8KB activation SRAM (~0.38 mm²). 
Leave activations, pooling, and control flow to the M4.", "Add a 512KB L2 cache (0.45 mm²) to the Cortex-M4 to solve the CMSIS-NN memory bandwidth bottleneck.", "Implement a dedicated FP16 matrix multiplication unit (0.2 mm²) that offloads the entire network execution from the M4."], "correct_index": 1}}, {"id": "tinyml-0613", "title": "NPU Delegation Coverage Determines Actual Speedup", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What speedup should MobileNetV2 expect when 18 of 22 layers, but 95% of compute, delegate to Ethos-U55?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 1}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Speedup is ~5x as expected. 82% of layers on the NPU means 82% of compute is accelerated, giving 1/(1-0.82) = 5.6x by Amdahl's Law.", "Actual speedup is ~6.8x (not 5x or 20x). Layer count (82%) misleads — compute delegation is 95% (28.5M/30M MACs). Naive Amdahl's predicts 1/0.05 = 20x, but each NPU-to-CPU fallback transition costs ~50 us of data transfer dead time. With 4 transitions: ~200 us overhead. After fixing the unsupported Mean op (reducing to 2 transitions): speedup improves to ~9.4x.", "Speedup is ~20x because 95% of compute MACs run on the NPU, which is 20x faster than the M55 CPU per MAC.", "Speedup is only ~2x because the Ethos-U55's 128 MACs/cycle throughput is bottlenecked by the narrow data bus between the NPU and M55, limiting effective bandwidth to 10% of peak."], "correct_index": 1}}, {"id": "tinyml-0614", "title": "Sub-threshold Voltage Operation — Power vs Speed Trade-off", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the Apollo4 meet the 200ms deadline, and what is the energy savings compared to a standard Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Apollo4 cannot meet the 200ms deadline because sub-threshold operation at 0.5V reduces the maximum clock to ~10 MHz, making inference take over 1 second.", "The Apollo4 at 0.5V/96 MHz meets the deadline easily: 1M MACs at 2 MACs/cycle = 5.2ms inference (194.8ms headroom). Power: 3 uW/MHz x 96 = 288 uW (vs standard M4 at 50 mW). Energy per inference: 1.5 uJ vs 150 uJ — 100x more efficient. 
Sub-threshold operation is transformative when the workload fits within the reduced clock speed's deadline.", "Power scales as V^2, so 1.2V to 0.5V gives (1.2/0.5)^2 = 5.76x power reduction at the same 168 MHz clock speed, and the Apollo4 meets the deadline with a 5.76x power savings.", "The Apollo4 meets the deadline but the power savings is only 2x because sub-threshold operation increases leakage current, which offsets most of the dynamic power reduction."], "correct_index": 1}}, {"id": "tinyml-0617", "title": "The Input-Dependent Watchdog Reset", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization that improved average-case compute latency lead to a catastrophic, input-dependent failure?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 2}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0618", "title": "The Watchdog Reboot Loop", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might the system be failing in this non-linear way, and why is the playbook's recommendation likely wrong for this specific failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0622", "title": "The Checkpoint-Watchdog Death Spiral", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the watchdog and checkpoint design enter a deterministic reboot loop during cold-start inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0623", "title": "The OTA Wear-Out Catastrophe", "topic": "ota-firmware-updates", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you assess this OTA design and predict the specific, time-delayed physical failure mode that caused this mass-bricking event?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0627", "title": "The Night-Rain Quantization Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might it be a dangerous trap, and what more robust system-level solution should you justify to your team?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0628", "title": "The Watchdog and the Unseen Workload", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the root causes of a localized inference latency increase causing watchdog reboots?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0629", "title": "The Fault-Tolerant Battery Killer", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What flash-write energy miscalculation causes the checkpointing sensors to drain batteries in under 3 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0634", "title": "The Deaf Automobile Watchdog", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is simply enabling a watchdog timer an insufficient fix for the vehicle voice assistant going deaf?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A watchdog timer requires a dedicated RTOS to function correctly, which is missing from this bare-metal system.", "The system is experiencing unmonitored data drift. Bad inputs cause NaN/Inf propagation that hangs the model. 
A watchdog just creates an infinite reboot loop.", "A 2-second watchdog is too short for an automotive environment and will frequently trigger during normal operations like a cold engine start.", "The watchdog resets the entire vehicle's CAN bus, which violates automotive safety integrity level (ASIL) standards."], "correct_index": 1}}, {"id": "tinyml-0635", "title": "The Silent Sensor Death Spiral", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a model update that passed memory checks cause a catastrophic watchdog loop, and why would it only affect a small subset of the fleet?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 4}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0636", "title": "The Federated Learning Battery Drain Catastrophe", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did a modest increase in model size and accuracy lead to a catastrophic, non-linear failure in battery life, and what critical TCO factor did the team's plan ignore?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 3}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0640", "title": "The Hypersensitive Wake-Word", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What dynamic range mismatch causes the INT8 wake-word model to become hypersensitive to loud background noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0641", "title": "The Watchdog Boot Loop Catastrophe", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the proposal to lengthen the watchdog timeout likely wrong, what is the probable root cause of the failure, and what catastrophic, non-linear fleet behavior is this masking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0643", "title": "The Keyword Spotting Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete inference pipeline for an always-on keyword spotting system on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0644", "title": "The SIMD Lane Starvation", "topic": "mcu-compute-constraints", "competency_area": 
"compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why do Cortex-M55 Helium SIMD instructions yield only a 1.6x speedup when memory is accessed via byte-by-byte loads?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 5}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M55's Helium unit has a 4-cycle latency for vector multiply-accumulate instructions, so processing 4 elements takes 4 cycles — only marginally better than scalar's 4 cycles.", "The SIMD execution unit was starved by scalar memory loads. The CPU can multiply 4 INT8 values in 1 cycle, but loading them byte-by-byte (LDRB) takes 4 cycles. Fix: cast 8-bit pointers to 32-bit aligned pointers for single-cycle word loads, then feed the SIMD unit. Proper aligned loads achieve the full 4x speedup; scalar loads limit it to 1.6x.", "The Cortex-M55 compiler auto-vectorizes the scalar loop identically to the SIMD intrinsics, so there is no performance difference between the two implementations.", "The 1.6x speedup is correct and expected — SIMD on Cortex-M only helps with floating-point operations, not 8-bit integer arithmetic."], "correct_index": 1}}, {"id": "tinyml-0645", "title": "The Cache-Line False Sharing", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does Core 0's Cortex-M7 ML inference slow by 30% when Core 1 writes independent SRAM variables?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0646", "title": "The MCU Roofline", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Where is the ridge point, and what does it tell you about which models are feasible?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 4}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ridge point is at ~100 Ops/Byte (similar to GPUs), meaning most TinyML models are memory-bound.", "The ridge point is 2.0 Ops/Byte. 
Conv2D (~18 Ops/Byte) and depthwise (~9 Ops/Byte) are above it, making them compute-bound.", "The roofline model doesn't apply to MCUs because they lack a cache hierarchy.", "The ridge point is at ~0.1 Ops/Byte because MCU memory bandwidth far exceeds compute throughput."], "correct_index": 1}}, {"id": "tinyml-0647", "title": "The MCU NAS Search Space", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What constraints must your search space encode that a standard NAS for desktop/cloud would ignore?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 2}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0649", "title": "MCUNet Search Space Design", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the search space to find a competitive model in 6x less time?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 3}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0650", "title": "Sub-Milliwatt Always-On Wake Word Detection", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture keeps always-on wake-word detection under 1 mW including microphone, ADC, features, and inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a Cortex-M4 at the lowest clock speed (4 MHz) to stay under 1 mW. At 4 MHz, the MCU draws ~1.5 mW active, plus 0.5 mW mic + 0.2 mW ADC = 2.2 mW total.", "A tiered always-on architecture: Stage 1 — analog VAD or ultra-low-power DSP (~20-50 uW) screens for speech-like audio 100% of the time. Stage 2 — lightweight feature extractor (~200 uW) activates only when VAD triggers (~10% of time). Stage 3 — full neural network (~1.5 mW) runs only on likely wake words (~1% of time). 
Average power: ~85 uW.", "Use a standard Cortex-M4 running the full neural network continuously but power-gate the microphone between inference cycles to stay under 1 mW.", "Achieve sub-milliwatt by using INT4 quantization to reduce the model's compute by 4x, bringing the M4's inference power from 2 mW to 0.5 mW."], "correct_index": 1}}, {"id": "tinyml-0652", "title": "The Float-to-Int Hardware Trap", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is the hardware FPU ignoring your code?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 3}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0654", "title": "The Contextual Awareness Crash", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What memory management architecture avoids the TFLite Micro single-arena OOM when switching between wake-word and speaker ID models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0655", "title": "The Zero-Copy Race Condition", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a complete, robust data flow architecture that achieves a safe, zero-copy pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0657", "title": "The DMA Energy Break-Even Point", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you calculate the DMA-vs-CPU-copy energy break-even transfer size for a 9,216-byte camera frame on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0658", "title": "The Siren's Screech: Designing a Robust Hearing Aid", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What mixed-precision deployment strategy keeps the hearing-aid model within the SRAM and latency budgets while avoiding overflow?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 4}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0663", "title": "The Conversational Doorbell's Memory Deficit", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a Cortex-M7 doorbell support a 256-token 
transformer KV cache when SRAM needs exceed 512 KB?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 5}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0667", "title": "The Look-aside Attention Cache", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three design decisions for a look-aside KV-cache architecture on a 512KB SRAM MCU, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0670", "title": "The Ghost in the Dashboard", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What three-part on-device architecture can detect humid-climate audio drift, disable the model, and store a diagnostic fingerprint?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0671", "title": "The Redundant Vision Failure", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can two Cortex-M7 vision MCUs detect silent weight corruption and hot-reload within a 100 ms frame budget using only on-chip resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0672", "title": "The Automotive Sensor Fusion Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Propose a quantization strategy for the two branches of the network, and determine the dominant factor in your energy budget calculation: compute or memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0673", "title": "The Streaming Sensor Fusion Dilemma", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions, and how do you justify them with quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply virtual memory by using external flash as swap space, paging KV-cache layers in and out as needed, similar to how mobile operating systems handle memory pressure.", "Three architectural decisions: (1) Reduce context from 512 to 64 tokens via sliding window (8x
KV-cache reduction). (2) Layer-by-layer DMA tiling: keep only 1 layer's INT8 KV-cache in SRAM (~16 KB) while prefetching the next layer from flash via double-buffered DMA. (3) Reduce to 4 layers + INT8 cache to meet the 1ms deadline (0.16ms/layer x 4 = 0.64ms). Final SRAM: ~96 KB (16 KB DMA + 64 KB activations/weights + 16 KB working). The key: treat flash as a managed memory tier with explicit software-controlled streaming.", "Quantize the entire model from FP16 to INT4, reducing the KV-cache from 4 MB to 1 MB. Then use aggressive structured pruning to remove 75% of attention heads, bringing it to 256 KB.", "The junior team is correct — it is impossible. A transformer with 16 layers and 512-token KV-cache fundamentally cannot run on a 256 KB SRAM MCU. The requirement should be changed to a simpler RNN-based model."], "correct_index": 1}}, {"id": "tinyml-0674", "title": "The Silent Failure of the In-Car AI", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions, and how do you justify them with quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0675", "title": "The Concurrent Wake-Word Crisis", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the scheduler and tensor arenas be designed so two TinyML models can run without additive SRAM residency?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 3}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0678", "title": "The Private Factory Floor", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can federated learning reduce false positives for FactorySense while staying within Cortex-M4 battery and business constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0680", "title": "The Sentient Fleet's Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid learning strategy balances centralized data cost, federated privacy, and on-device power for the 1 million vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0681", "title": "The Silent Failure of the Emergency Keyword", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What root-cause analysis and mitigation plan should be used after INT8 quantization makes the emergency KWS model fail in noise?", "chain_ids": 
["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 3}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0682", "title": "The Federated Wearable ROI Proposal", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What ROI and power analysis justifies or limits federated learning for the smart tremor-detection patch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0683", "title": "The TinyML Conversational AI Memory Wall", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system handle conversational context and follow-up questions within the strict 480 KB SRAM and 2 MB Flash constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0684", "title": "The Billion-Dollar Doorbell Breach", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What phased response should mitigate the ML accelerator side-channel vulnerability without a physical recall?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 3}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0685", "title": "The Dusk Disaster: Quantization-Aware Architecture", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What quantization-aware architecture plan should address the AEB model's dusk failures within MCU memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0686", "title": "The Sun-Blinded Driver Monitor", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you propose to detect this occlusion on-device, and what is the recovery strategy without violating the 1ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0689", "title": "The Guardian-Node Dilemma: TCO vs. 
Security for an On-Device Learning System", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid update strategy balances TCO and security for Guardian-Node devices without a secure element?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0696", "title": "The Flat Memory Reality", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why should a Cortex-M4 not `malloc` activation buffers from a 256 KB bare-metal SRAM heap during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0697", "title": "The Memory-Mapped Sensor Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does busy-wait SPI polling make Cortex-M0+ vibration inference 3x slower than expected?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 1}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0698", "title": "The Flash-SRAM Boundary", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can Cortex-M4 weights stay in 1 MB flash while activations must fit in 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 2}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0699", "title": "The Stack vs Heap on MCU", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does TFLite Micro require a static tensor arena instead of malloc on a small Cortex-M0+, and how does dynamic allocation affect WCET analysis?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 2}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0700", "title": "The TFLite Micro Arena Sizing", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you determine the minimum arena size without trial and error, and why is it not simply the sum of all tensor sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0702", "title": "SRAM Needed for 
MobileNet Activations", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you estimate the peak SRAM required for the activation tensors and determine if the model fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 3.4 MB of parameters don't fit in 512 KB SRAM, so the model cannot run on this hardware without significant compression.", "Parameters (3.4 MB) live in flash, not SRAM. SRAM holds activations only. Peak activation at 128x128 INT8 MobileNetV2 is 448 KB (expansion layer input 64 KB + output 384 KB). With 80 KB firmware overhead, available SRAM is 432 KB — deficit of 16 KB. Fix: use width multiplier 0.75x, reducing peak to 336 KB (fits with 96 KB headroom) at only 2% accuracy loss (69.8% vs 71.8%).", "MobileNetV2 fits easily because INT8 quantization reduces activation memory by 4x compared to FP32, bringing peak SRAM to ~108 KB.", "The model fits if you use in-place operations for all depthwise convolution layers, which eliminates the need for separate input and output activation buffers."], "correct_index": 1}}, {"id": "tinyml-0703", "title": "Flash Wear from Logging Frequency", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long will 2 MB of 100K-cycle external NOR flash last when logging 32 bytes at 10 inferences per second?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 0}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["19.8 years (calculating 200 GB total divided by 27.6 MB/day).", "~2.8 hours because naive metadata pointer updates erase a 4 KB sector on every write.", "Indefinitely, because modern NOR flash has built-in wear leveling.", "17.4 hours until the flash fills up capacity-wise."], "correct_index": 1}}, {"id": "tinyml-0704", "title": "The FreeRTOS Heap Exhaustion", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did spawning and deleting an ML task per image cause heap exhaustion despite having plenty of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0705", "title": "The Unaligned Access Fault", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does TFLite Micro's packed tensor memory layout cause unaligned access on the Cortex-M0+, and why does the ISA constraint force you to pad your ML tensors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0706", "title": "The DMA Channel Collision", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How does the ML inference's DMA bandwidth requirement conflict with the sensor DMA, and how can priority inversion resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0707", "title": "The TFLite Micro Heap Overhead", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does `AllocateTensors()` fail when a 32 KB MCU gives TFLite Micro exactly a 15 KB tensor arena?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 3}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0708", "title": "The CMSIS-NN Alignment Fault", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an unaligned `int8_t tensor_arena[20000]` crash CMSIS-NN with a Hard Fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0709", "title": "The Tensor Arena Overflow", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 210 KB tensor arena fit when a Cortex-M4 has only 200 KB of SRAM available?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 3}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0710", "title": "The Double Buffering DMA Strategy", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a DMA double-buffering scheme and prove mathematically that it eliminates data loss?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 2}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a large circular buffer (32 KB) in SRAM. The CPU will always catch up because inference (40ms) is faster than the fill time. No synchronization needed.", "Allocate two 3,072-byte buffers (512 samples x 6 bytes/sample). Configure STM32 DMA in double-buffer mode (DBM bit). DMA fills Buffer A while CPU processes Buffer B, then swap. Fill time: 160ms. Process time: 40ms. Utilization: 25%. This mathematically guarantees no data loss because process time < fill time, and the buffers are hardware-isolated — DMA physically cannot write to the buffer the CPU is reading.", "Use a single buffer with an interrupt flag: the DMA sets a flag when complete, and the CPU polls the flag before reading. 
This prevents corruption because the CPU never reads during an active DMA transfer.", "Disable DMA during inference and re-enable it after. The 40ms gap in sensor data is acceptable because the vibration signal changes slowly at 3.2 kHz."], "correct_index": 1}}, {"id": "tinyml-0711", "title": "The Cache Miss Penalty", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you explain the 2.75x slowdown and calculate the cache miss rate?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 2}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The first run is slow due to TFLite Micro interpreter initialization overhead. Subsequent runs reuse cached interpreter state and skip the setup, achieving the 8ms baseline.", "The 400 KB model far exceeds the 16 KB D-cache. When executing from flash, severe cache thrashing occurs. 14ms of stall = ~6.7M penalty cycles = 1.12M cache misses (6 wait-state penalty). This means the 12,800 cache lines (400 KB) are fetched ~90 times per inference. Fix: place the hottest layers (top 3, 60 KB) in DTCM for zero-wait-state access, reducing inference to ~12ms.", "The slowdown is caused by flash memory write-back operations that occur on first access. The flash controller must initialize its page buffers, which takes exactly 14ms for a 400 KB binary.", "The 2.75x slowdown is due to branch prediction cold-start. The Cortex-M7's branch predictor has no history on the first run, causing pipeline stalls on every conditional branch in the inference loop."], "correct_index": 1}}, {"id": "tinyml-0712", "title": "The STM32H7 Dual-Bank Flash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does STM32H743 dual-bank flash let a 350 KB model update run over BLE while inference continues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0714", "title": "MCU Flash Wear Monitoring", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the circular flash log be designed so the ML output cadence stays within the 10-year endurance budget?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 1}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash has 100,000 write cycles — at 1 inference/second, that's enough for 100,000 seconds = 27.8 hours of operation before the flash wears out.", "At 1 Hz with 64 bytes/inference: naive approach erases each 4 KB sector 84.4 times/day, exhausting 100K endurance in 3.25 years (fails 10-year target). Fix: aggregate into 1-minute summaries (32 bytes each), reducing to 0.7 P/E cycles/sector/day = 2,566 cycles over 10 years. 
Design rule: max output complexity = 16 bytes/inference to stay within the flash endurance budget for 5+ year deployments.", "The flash endurance is determined by the total bytes written, not the number of erase cycles. 64 bytes x 86,400 inferences/day x 3,650 days = 20.2 GB total writes, well within the 2 MB x 100,000 = 200 GB lifetime budget.", "Use the ESP32's built-in wear leveling library, which automatically distributes writes across all flash sectors, ensuring uniform wear and guaranteeing the 10-year lifetime regardless of inference output size."], "correct_index": 1}}, {"id": "tinyml-0715", "title": "Anomaly Detection on Streaming Sensor Data with Limited Memory", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you fit the streaming pipeline, the model, and the baseline statistics in 256 KB SRAM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Store the last 60 seconds of raw sensor data (1.15 MB) as the baseline reference for anomaly comparison, using external PSRAM to overcome the 256 KB SRAM limit.", "Use exponential moving average (EMA) statistics instead of raw data for the baseline. Memory layout utilizes 218.4 KB (85.3%) of the 256 KB SRAM.", "Reduce the sensor sampling rate from 3.2 kHz to 800 Hz to fit 60 seconds of raw baseline data (288 KB) within the 256 KB SRAM budget.", "Run inference only on the most recent 1-second window with no baseline comparison, eliminating the need for storage."], "correct_index": 1}}, {"id": "tinyml-0716", "title": "Execute-in-Place vs Copy-to-SRAM for Model Weights", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the STM32H7 copy all 500 KB of INT8 weights to SRAM, or only the 350 KB of large layers that thrash the M7 D-cache?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 2}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Copy all 500 KB of weights to SRAM before inference. SRAM is 5x faster than flash, so inference speeds up by 5x.", "Selective SRAM copy. Copy only the 5 large layers (350 KB) to SRAM to achieve a 1.87x speedup while saving 150 KB compared to a full copy.", "Keep all weights in flash with XIP. 
The M7's ART Accelerator handles sequential reads perfectly.", "Use the M7's ITCM (64 KB) for the most frequently accessed weight layers and keep the rest in flash."], "correct_index": 1}}, {"id": "tinyml-0717", "title": "DMA Transfer Time vs Inference Time", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can the system run in real-time without dropping audio samples, and what's the critical timing constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system cannot run in real-time because DMA and CPU share the AHB bus, and the 32 KB/s DMA transfer rate consumes most of the available bandwidth.", "Yes, real-time is feasible. Fill time: 1024 samples / 16 kHz = 64ms. Process time: MFCC (3ms) + inference (15ms) = 18ms. Since 18ms < 64ms, the CPU finishes processing Buffer A before DMA fills Buffer B. CPU utilization: 28%. Critical constraint: process time must always be less than fill time. At 48 kHz sampling (21.3ms fill), utilization rises to 85% — danger zone for jitter.", "The system drops samples because the I2S DMA transfer consumes 50% of the AHB bus bandwidth, leaving insufficient bandwidth for the CPU to access the tensor arena during inference.", "Real-time is possible only if the MFCC feature extraction is offloaded to a hardware DSP accelerator, freeing the CPU to focus exclusively on the 15ms neural network inference."], "correct_index": 1}}, {"id": "tinyml-0718", "title": "Multi-Model SRAM Partitioning", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can VAD, KWS, and command models with 8 KB, 45 KB, and 60 KB arenas coexist in 512 KB Cortex-M7 SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Load all three models' weights into SRAM (20 + 80 + 120 = 220 KB) plus three separate activation arenas (8 + 45 + 60 = 113 KB) = 333 KB total. Fits in 512 KB with 179 KB headroom.", "Exploit the cascading trigger pattern: VAD runs continuously, KWS only on VAD trigger, CMD only on KWS trigger. Since KWS and CMD never run simultaneously, share one arena: max(45, 60) = 60 KB. Total activation SRAM: 8 KB (VAD, always resident) + 60 KB (shared KWS/CMD) = 68 KB. Weights live in flash (220 KB), not SRAM. Total SRAM: 88 KB (17% of 512 KB), leaving 424 KB for expansion.", "All three models must have dedicated, simultaneously-allocated arenas because the audio pipeline requires continuous processing and cannot be interrupted for context switching.", "Use a single 60 KB arena for all three models, swapping model weights from flash between each inference. 
The 0.5ms weight-loading overhead is negligible compared to inference time."], "correct_index": 1}}, {"id": "tinyml-0720", "title": "The Peak RAM Puzzle", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can an 8-layer CNN needing 300 KB peak activation RAM fit in 256 KB of Cortex-M7 SRAM?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 4}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0721", "title": "The Non-Volatile MRAM Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does putting the tensor arena in Apollo4 MRAM drain the battery in 2 days instead of 14?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0722", "title": "The ITCM Execution Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does compiling a custom ML C++ kernel to QSPI Flash cause a 600 MHz Cortex-M7 CPU to effectively run at quarter speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0723", "title": "The L1 Cache Miss Penalty", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does writing to memory with a massive stride destroy performance on an MCU with an L1 Data Cache?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0724", "title": "The Execute-in-Place Energy Tax", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What physical hardware reality did you ignore that causes reading 1 MB over SPI NOR Flash to drain a battery in 2 days?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 3}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0725", "title": "The Multi-Tenant MCU", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an nRF5340 schedule 80 KB KWS and 140 KB command arenas when both must fit in 230 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Allocate 80 KB + 140 KB = 220 KB for both arenas simultaneously. 
230 KB available, leaving 10 KB headroom — sufficient for both models to coexist.", "Use two separate TFLite Micro interpreter instances but time-multiplex the physical memory. During KWS (normal): 80 KB arena. When CMD triggers: pause KWS, repurpose the memory for CMD's 140 KB arena. With KWS's 100ms period and 50% window overlap, missing one 45ms window loses zero detections. Peak SRAM: 140 KB, leaving 90 KB for stack, ISR frames, and BLE buffers.", "Run both models in a single TFLite Micro interpreter by concatenating the model graphs. The interpreter will automatically manage memory sharing between the two models.", "Allocate the full 230 KB as a shared arena and let TFLite Micro's dynamic memory planner handle the allocation for both models during concurrent execution."], "correct_index": 1}}, {"id": "tinyml-0726", "title": "The SRAM Bank Conflict Slowdown", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does doubling STM32H7 depthwise channels from 64 to 128 cause a 2.76x slowdown instead of 2x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0727", "title": "The DMA Buffer Corruption", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does Cortex-M7 DMA audio in AXI SRAM produce one confident wrong KWS result every 500 inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0728", "title": "The Flash Read Disturb", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a vibration model reading 400 KB weights from flash lose precision after 6 months until re-flashed?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 2}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash doesn't wear out from reading — only writes cause wear. The accuracy drop must be from a firmware bug that accumulated sensor calibration drift over 6 months.", "NOR flash read disturb: the 400 KB model is read via XIP at 10 inferences/second, accumulating ~1.24 billion reads per cell over 6 months. This exceeds the ~500M read-disturb threshold for 90nm NOR flash, causing ~15% of cells in hot pages to flip bits. The ~480K flipped bits create ~3 LSB average weight noise on INT8 values, matching the observed 12% accuracy drop. 
Fix: periodically refresh (re-write) the model sectors every ~72 days, or move the model to MRAM/FRAM.", "The flash memory cells are slowly losing charge at room temperature (data retention degradation), causing random bit flips that accumulate over 6 months.", "The MCU's voltage regulator has drifted over 6 months, causing the flash read voltage to shift outside the optimal sensing window, producing intermittent read errors."], "correct_index": 1}}, {"id": "tinyml-0729", "title": "The SRAM Fragmentation Crash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does AllocateTensors() fail to allocate a 300 KB arena when there is 600 KB of free SRAM?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 4}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["If 600 KB is free and 300 KB is needed, the allocation should succeed. This must be a bug in TFLite Micro's arena allocator that fails on large allocations.", "Heap fragmentation from frequent model switching. Over 2,880 switches (48 hours), small allocations (BLE buffers, DMA descriptors, sensor logs) scatter across the heap. After 48 hours: 600 KB total free but no single contiguous 300 KB block. Fix: reserve a static 300 KB arena at boot (never freed), or use a pool allocator for small objects to prevent heap fragmentation.", "The RTOS is consuming additional SRAM through stack growth in background threads, reducing the actual free memory from 600 KB to below 300 KB over 48 hours.", "A memory leak in the model switching code allocates 200 bytes per switch without freeing, consuming 200 x 2,880 = 576 KB over 48 hours, leaving only 24 KB free."], "correct_index": 1}}, {"id": "tinyml-0730", "title": "The SPI DMA Cache Coherency Failure", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a Cortex-M7 print stale zero values after the DMA successfully writes correct SPI camera pixels into RAM?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 3}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0731", "title": "Flash Endurance Under Continuous Inference Logging", "topic": "vram-budgeting", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does the ML model's output dimensionality dictate the flash write rate, and how does the flash endurance budget determine the maximum model output complexity you can log?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0732", "title": "The Unaligned Struct Padding", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": 
"Why does `struct { char sensor_id; int32_t prediction; char status; }` write 12 bytes instead of 6?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0733", "title": "The Zero-Point Question", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is naive rounding incorrect for INT8 quantization, and what is the role of a zero-point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0734", "title": "The Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does pushing a keyword model from INT8 to INT4 collapse accuracy from 91% to 74%, and how do you recover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0735", "title": "The Fixed-Point Accumulator Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the overflow condition in an INT32 accumulator for an INT8 Conv2D layer, and what is the maximum safe number of accumulations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["INT32 can hold +/- 2 billion. With only 4,608 accumulations (3x3x512), overflow is mathematically impossible regardless of input values.", "INT8 x INT8 max product: 128 x 128 = 16,384. Conv2D 3x3x512: 4,608 accumulations. Worst-case sum: 4,608 x 16,384 = 75.5M. INT32 max: 2.15B. Headroom: 28.5x — safe for this layer. Maximum safe accumulations: 2^31 / (2 x 128^2) = 65,536. Layers exceeding this (e.g., 3x3x8192 at 73,728 accumulations) can overflow, especially with zero-point correction adding to the accumulated values.", "The INT16 intermediate product overflows before reaching the INT32 accumulator. INT8 x INT8 = max 16,384, which exceeds INT16 max (32,767 signed), so every multiply produces garbage.", "The overflow is caused by the bias addition after accumulation, not the MAC operations themselves. 
The INT32 accumulator handles the MACs fine, but adding a large bias pushes the result past INT32 limits."], "correct_index": 1}}, {"id": "tinyml-0736", "title": "Quantization Error for INT4 on Cortex-M4", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the accuracy impact of INT4 quantization and the actual inference speedup (or slowdown) on M4F hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["INT4 halves model size and doubles inference speed because you process twice as many values per 32-bit register, achieving 4 MACs/cycle on the M4F.", "INT4 on M4F is a lose-lose: no native INT4 instructions means unpacking to INT8/16 before arithmetic, making it ~2x slower than INT8 (35.8ms vs 17.9ms). Accuracy drops 3-8% (PTQ) or 3% (QAT). Better alternative: prune INT8 to 50% sparsity — same 75 KB size, ~90% accuracy, 9ms inference (4x faster than INT4). INT4 only wins on hardware with native sub-byte SIMD (M55 Helium, NPUs).", "INT4 quantization is always beneficial because it reduces both model size and activation memory by 2x, enabling a second model to fit alongside the keyword spotter.", "INT4 achieves the same accuracy as INT8 when using quantization-aware training, and the 2x size reduction is worth the marginal 10% inference slowdown from unpacking overhead."], "correct_index": 1}}, {"id": "tinyml-0737", "title": "The CMSIS-DSP FFT Scaling Bug", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 256-point fixed-point CMSIS-DSP FFT output a sine amplitude near 1/256, and what scaling behavior did you forget to reverse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0738", "title": "The Per-Channel Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might an MCU still choose per-tensor quantization after per-channel recovers only 3% accuracy?", "chain_ids": ["tinyml-chain-auto-secondary-004-17"], "chain_positions": {"tinyml-chain-auto-secondary-004-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-17": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Always use per-channel quantization — it's strictly better because it recovers 3% accuracy with no downsides on any hardware.", "Per-channel adds ~18% compute overhead (64 separate requantization ops per layer vs 1 for per-tensor). On a 100ms inference budget: per-tensor = 85ms (fits), per-channel = 100ms (barely fits). If deadline is 90ms: only per-tensor works. Per-channel also needs 10 KB metadata (vs 160 bytes) for 20 layers with 64 channels each. 
Choose per-tensor when latency or memory is tight; per-channel when accuracy is critical and headroom exists.", "Per-tensor is always better on MCUs because the single scale/zero-point fits in a register, enabling CMSIS-NN to use the fast SMLAD path without any memory lookups.", "The 3% accuracy recovery from per-channel is a benchmark artifact — in real-world deployment on MCUs, per-channel and per-tensor achieve identical accuracy because the quantization noise is dominated by other factors."], "correct_index": 1}}, {"id": "tinyml-0739", "title": "The Binary Neural Network on MCU", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What actual speedup should a binary neural network achieve on Cortex-M4, and why is it not a full 32x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["BNNs execute 32 operations per 1-cycle XNOR, guaranteeing a 32x latency reduction.", "The actual speedup is ~1.6x due to the lack of hardware popcount on M4, requiring 10+ cycles per 32 operations, taking 0.93ms vs INT8's 1.49ms.", "BNNs achieve 32x speedup but fail to converge during training, blocking deployment.", "BNNs take ~60ms because the Cortex-M4 defaults to floating-point emulation for binary kernels."], "correct_index": 1}}, {"id": "tinyml-0740", "title": "The Integer Arithmetic Engine", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How does an integer-only quantized Conv2D execute on a Cortex-M4 from inputs through requantized outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Simply multiply INT8 inputs by INT8 weights, truncate to INT8, and output. The zero-points and scales are only needed during training, not inference.", "Full integer pipeline: accumulate INT8xINT8 products into INT32, subtract cross-terms involving zero-points, add INT32 bias. Then requantize via integer multiply and bit-shift, apply zero-point, and clamp to [-128, 127]. 
No floats anywhere.", "The Cortex-M4 must use software floating-point emulation for the scale factors during requantization, adding ~10 cycles per output element.", "The pipeline uses lookup tables (LUTs) for all multiplications: precompute all 256x256 possible INT8 products in a 64 KB table and replace multiplication with table lookups."], "correct_index": 1}}, {"id": "tinyml-0741", "title": "The Volatile Variable Wipe", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does sharing a 10,240-byte TFLite Micro arena between STM32 inference and USB logging cause a hard fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0742", "title": "The CMSIS-NN Dimension Limit", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 100,000-sample 1D input produce garbage outputs in CMSIS-NN on STM32 without crashing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 100,000-sample input exceeds the Cortex-M's maximum DMA transfer size.", "CMSIS-NN uses int16_t for dimensions. 100,000 wraps to -31,072. The loop exits immediately, leaving uninitialized memory.", "The input tensor exceeds the MCU's maximum contiguous SRAM allocation, causing silent fallback.", "The STM32's memory protection unit (MPU) triggers a bus fault."], "correct_index": 1}}, {"id": "tinyml-0743", "title": "The CMSIS-NN vs Manual Implementation", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does CMSIS-NN beat plain C by about 8x on a 3x3, 32-to-64 channel Cortex-M4 Conv2D?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0744", "title": "Edge Impulse vs TFLite Micro Deployment", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "When should the team choose Edge Impulse versus direct TFLite Micro for the nRF52840 keyword spotting model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0745", "title": "The CMSIS-NN Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does CMSIS-NN cut Cortex-M4 INT8 matrix multiply latency from 45 ms to 6 ms without changing the clock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["CMSIS-NN uses NEON SIMD instructions on the Cortex-M4, processing 4 INT8 values per cycle compared to the naive C code's scalar processing.", "CMSIS-NN exploits the M4's DSP extension — specifically SMLAD (dual 16-bit MAC in 1 cycle). 
It sign-extends INT8 values into 16-bit half-words and packs two per 32-bit register, achieving 2 MACs/cycle. Combined with loop unrolling and cache-friendly data reordering, effective throughput: ~0.6 cycles/MAC vs naive C's ~4.5 cycles/MAC. This is the MCU equivalent of using Tensor Cores on a GPU — specialized hardware paths that most C code never touches.", "The speedup comes entirely from compiler auto-vectorization that the CMSIS-NN headers enable via special pragma directives, not from any ISA-level instructions.", "CMSIS-NN achieves 7.5x speedup by moving the matrix multiply computation to the DMA controller, which performs the arithmetic while the CPU handles other tasks."], "correct_index": 1}}, {"id": "tinyml-0746", "title": "The CMSIS-NN Transpose Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a PyTorch NCHW CNN spend 30% of Cortex-M4 inference time in TFLite Micro Transpose ops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TFLite Micro is inserting unnecessary Transpose ops due to a framework bug. Filing a bug report and rebuilding TFLite Micro from the latest source will eliminate the overhead.", "PyTorch uses NCHW layout; TFLite Micro/CMSIS-NN requires NHWC. The converter inserts Transpose ops between every layer to convert. Transposing 32x32x64 INT8 via strided reads: ~500K cycles per op (no data cache on M4). With 10 layers: ~10M wasted cycles (60ms at 168 MHz). Fix: export from PyTorch in NHWC (channels_last memory format) or convert layouts in ONNX before TFLite export, eliminating all runtime transpose ops.", "The transpose operations are caused by the model having batch normalization layers that require NCHW format, even though the convolutions use NHWC.", "The overhead is unavoidable because CMSIS-NN internally uses NCHW for all computations, requiring a layout conversion before and after every kernel call regardless of the input format."], "correct_index": 1}}, {"id": "tinyml-0747", "title": "The Int8 Asymmetric Zero-Point", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is `(input - zp_in) * (weight - zp_wt)` slower than a symmetric INT8 dot product in TFLite Micro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The zero-point subtraction is optional and can be disabled by setting the quantization mode to 'symmetric' in TFLite Micro's runtime configuration.", "Asymmetric quantization requires subtracting zero-points before every MAC: (input - zp_in) * (weight - zp_wt) = 3 instructions/MAC vs symmetric's 2 (input * weight when zp_wt = 0). For 1M MACs: 50% overhead (3M vs 2M instructions). Fix: use symmetric quantization for weights (force zp = 0 during QAT). 
CMSIS-NN optimizes symmetric weights via SMLAD (2 MACs/cycle); asymmetric falls back to scalar code.", "The extra subtraction instructions cause pipeline stalls on the Cortex-M4 because the SUB instruction has a 3-cycle latency that cannot be hidden by the in-order pipeline.", "The zero-point subtraction doubles memory bandwidth because each weight must be loaded twice — once for the subtraction and once for the multiplication."], "correct_index": 1}}, {"id": "tinyml-0750", "title": "The Float-to-Double Silent Promotion", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does `float result = sample * 3.14159;` take ~60 cycles on a Cortex-M4F?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0751", "title": "The TFLite Micro Resolving Pointer", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does looking up a tensor address via GetTensor() take 3ms on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["GetTensor() is slow because it reads the full tensor data from flash memory into SRAM on each call, wasting time copying data that's already accessible via XIP.", "TFLite Micro stores tensors in a FlatBuffer format. Each GetTensor() call parses offset tables, performs bounds checking, and resolves nested pointers. With poor cache locality on scattered vtable structures, each lookup incurs multiple cache misses. For 150 GetTensor calls per inference at ~1000 cycles each: 150K cycles = 1-3ms overhead. Fix: use TFLite Micro's static memory planner (pre-resolves all pointers at init) or cache resolved pointers in a lookup table after first inference.", "The 3ms overhead is caused by TFLite Micro's garbage collector, which runs between every layer to reclaim temporary activation buffers.", "GetTensor() acquires a mutex lock for thread safety on each call. 
The lock/unlock overhead of the RTOS accounts for the 3ms because the RTOS scheduler runs on every mutex release."], "correct_index": 1}}, {"id": "tinyml-0752", "title": "The Operator Fusion on MCU", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much SRAM and latency can fusing Conv2D, BatchNorm, and ReLU save on a Cortex-M7 MobileNet block?", "chain_ids": ["tinyml-chain-auto-secondary-004-32"], "chain_positions": {"tinyml-chain-auto-secondary-004-32": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0754", "title": "TFLite Micro vs TVM vs Custom Compiler", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compare them across five dimensions: code size, inference speed, memory efficiency, portability, and engineering effort?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0755", "title": "The Model Compression for Flash", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which compression options can reduce the 1.2 MB FP32 person detector to fit within the 800 KB Flash budget?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0756", "title": "Continuous Learning on MCU", "topic": "on-device-learning", "competency_area": "parallelism", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Is it economically and computationally feasible to perform on-device incremental learning on the Cortex-M7 rather than pushing a full model update via satellite?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0758", "title": "TinyML Memory Hierarchy", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "On a typical $2 microcontroller used for TinyML (like an ARM Cortex-M), what are the two main types of memory, and what are they used for?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["HBM for weights and NVMe for activations.", "Flash memory for read-only model weights, and SRAM for intermediate activations.", "L3 Cache for weights and DDR4 for activations.", "Virtual memory backed by a cloud server."], "correct_index": 1}}, {"id": "tinyml-0759", "title": "Integer-Only Inference", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "recall", 
"bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why do frameworks like TFLM heavily emphasize integer-only (INT8) operations instead of FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Because microcontrollers cannot physically read 32-bit words from memory.", "Integer math is mathematically proven to be more accurate for audio processing.", "Many microcontrollers lack hardware Floating Point Units; emulating floats in software is too slow and power-hungry.", "INT8 prevents the microcontroller from being hacked via buffer overflows."], "correct_index": 2}}, {"id": "tinyml-0761", "title": "The Sensor Buffer Overflow", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much SRAM does the input buffer require, and what fraction of total SRAM does it consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1,000 bytes (~0.4% of SRAM)", "3,000 bytes (~1.1% of SRAM)", "6,000 bytes (~2.3% of SRAM)", "30,000 bytes (~11.4% of SRAM)"], "correct_index": 1}}, {"id": "tinyml-0762", "title": "The INT8 Quantization Memory Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much Flash memory does INT8 quantization save, and will the quantized model fit in 1 MB of Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 250 KB saved (25%), INT8 model is 750 KB", "B) 500 KB saved (50%), INT8 model is 500 KB", "C) 750 KB saved (75%), INT8 model is 250 KB", "D) 875 KB saved (87.5%), INT8 model is 125 KB"], "correct_index": 2}}, {"id": "tinyml-0763", "title": "The Lookup Table vs Compute Tradeoff", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a model layer has 1,000 activations per inference, how many cycles does each approach use, and what is the speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) LUT is 4× faster (30,000 vs 120,000 cycles)", "B) LUT is 10× faster (12,000 vs 120,000 cycles)", "C) LUT is 40× faster (3,000 vs 120,000 cycles)", "D) LUT is 120× faster (1,000 vs 120,000 cycles)"], "correct_index": 2}}, {"id": "tinyml-0764", "title": "The MFCC Feature Memory Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would you calculate the total SRAM needed for the feature matrix that feeds the neural network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 160 bytes (40 features × 4 bytes)", "B) 1,320 bytes (33 frames × 40 features × 1 byte)", "C) 5,280 bytes (33 frames × 40 features × 4 bytes)", "D) 32,000 
bytes (raw audio for 1 second)"], "correct_index": 1}}, {"id": "tinyml-0765", "title": "The Double Buffer DMA Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What throughput and CPU utilization result when DMA takes 1.28 ms and CPU feature extraction takes 0.9 ms per audio frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 58.7 kHz throughput, 41% CPU utilization", "B) 100 kHz throughput, 70% CPU utilization", "C) 100 kHz throughput, 100% CPU utilization", "D) 142 kHz throughput, 50% CPU utilization"], "correct_index": 1}}, {"id": "tinyml-0766", "title": "The Depthwise Separable Ops Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many multiply-accumulate operations does each approach require, and what is the reduction factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Standard: 75.5M MACs, Separable: 37.7M MACs (2x reduction)", "Standard: 75.5M MACs, Separable: 18.9M MACs (4x reduction)", "Standard: 75.5M MACs, Separable: 8.98M MACs (8.4x reduction)", "Standard: 75.5M MACs, Separable: 0.59M MACs (128x reduction)"], "correct_index": 2}}, {"id": "tinyml-0767", "title": "The Pruning Sparsity Threshold", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why doesn't the 5x weight reduction translate to a 5x speedup, and what sparsity pattern would help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Unstructured gives 5x speedup; structured gives 2x", "B) Both give 5x speedup — the bottleneck is elsewhere", "C) Unstructured barely helps compute; structured gives the real speedup", "D) Neither helps on MCUs; only quantization reduces latency"], "correct_index": 2}}, {"id": "tinyml-0768", "title": "The Watchdog Timer Recovery", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If the watchdog timeout is set to 500ms, what is the maximum downtime per hang event, and what is the risk of an aggressive 150ms timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 500ms max downtime; 150ms timeout is safe with 30ms margin", "B) 700ms max downtime; 150ms timeout risks false resets under ISR preemption", "C) 120ms max downtime; the watchdog resets instantly", "D) No downtime — the watchdog prevents hangs from occurring"], "correct_index": 1}}, {"id": "tinyml-0770", "title": "The CMSIS-NN SIMD Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the 
theoretical speedup from SIMD, and explain why the real speedup is closer to 2.5x instead of 4x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0776", "title": "The Knowledge Distillation MCU Fit", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What accuracy can you realistically expect from knowledge distillation at this compression ratio, and what are the memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0778", "title": "The Flash Write Endurance Limit", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How many days until the sector wears out, and how would wear leveling across 8 sectors extend the lifetime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 370 days (1 sector), 2,960 days (8 sectors)", "B) 37 days (1 sector), 296 days (8 sectors)", "C) 3.7 days (1 sector), 29.6 days (8 sectors)", "D) 10,000 days (1 sector), never wears out"], "correct_index": 1}}, {"id": "tinyml-0779", "title": "The Asymmetric Quantization Outlier Drop", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What calibration issue explains the recall drop for extreme sensor values after 8-bit quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0780", "title": "The CMSIS-NN Operator Fallback Latency", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the INT8 CNN become slower than FP32 despite the expected CMSIS-NN SIMD speedups?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 3}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0781", "title": "The QAT Batch Normalization Folding Failure", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What QAT conversion error most likely causes the deployed INT8 Cortex-M4 model to output garbage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0782", "title": "The INT4 Dynamic Unpacking Overhead", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does INT4 quantization make inference slower and more power-hungry than INT8?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0783", "title": "The 32-bit Accumulator Overflow", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do INT16 activations with INT8 weights overflow intermediate buffers during large dot products?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0784", "title": "The Single Batch Memory Bandwidth Wall", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Orin model lose utilization and miss latency targets when moving from batch size 32 to batch size 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0785", "title": "The Depthwise Separable Arithmetic Intensity Drop", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does replacing standard convolutions with depthwise separable convolutions barely improve Jetson Orin latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0786", "title": "The Flash Memory Wait State Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does overclocking the Cortex-M4 from 120 MHz to 168 MHz fail to reduce TinyML inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0787", "title": "The Unstructured Sparsity Memory Bloat", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning produce no speedup and higher memory use on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0788", "title": "PCIe Transaction Overhead in Edge-to-Cloud GPU Offloading", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the local A100 underutilized when processing many tiny TinyML sensor payloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0790", "title": "SRAM Bank Conflicts During Concurrent DMA and CPU Access", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "Why does DMA audio capture slow CPU inference by 50%, and how should SRAM banks be arranged?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0791", "title": "Tinyml New 0013", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do two models that run in 10 ms alone spike to 45 ms latency when run concurrently on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0792", "title": "Power Consumption Overhead of Execute-In-Place from Flash", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does executing dense model weights directly from Flash increase power and heat on the microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0793", "title": "Heap Fragmentation from Variable-Length Tensor Allocation", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does variable-length dynamic allocation eventually crash the Cortex-M4 despite enough total free SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0794", "title": "Brownout Resets from Peak ALU Switching Current", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can the heaviest convolution layer cause random Cortex-M4 resets even when the software logic is correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0795", "title": "Thermal Throttling Under Continuous Edge ML Workloads", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Jetson Orin inference latency severely degrade after several minutes of continuous operation and only recover after an extended cool-down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0796", "title": "SRAM Retention Leakage in Deep Sleep Mode", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What retained SRAM leakage explains the 20 microamp deep-sleep drain, and how can bank power-down meet the target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0798", "title": "Clock Drift and Feature 
Misalignment in Time-Series Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What sampling-rate drift in the new accelerometer batch makes normal movements look anomalous?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0799", "title": "RTOS Deadline Misses from Blocking CPU Polling I/O", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do camera frame captures cause 1 ms control-loop deadlines to be missed on the M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0800", "title": "False Positives from Seasonal Ambient Temperature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What long-term sensor drift causes the industrial temperature model to output continuous false positives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0801", "title": "CPU Preprocessing Bottleneck in Edge Vision Pipelines", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What preprocessing bottleneck keeps the video pipeline below 15 FPS despite 5 ms NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0802", "title": "Flash Memory Exhaustion from Dual-Bank A/B OTA Updates", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does safe OTA deployment of a 200 KB model fail on a Cortex-M4 with 256 KB Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0805", "title": "TFLite Micro Dynamic Interpreter Execution Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What interpreter overhead causes 30% of TFLite Micro inference time to occur outside math kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0806", "title": "Model Weight Corruption from Cross-Architecture Endianness Mismatch", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What endianness mismatch can make weights trained on x86 produce nonsense on a legacy DSP microcontroller?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0808", "title": "Hardware Resets from Watchdog Timer Starvation During Inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the M4 reset during a 500 ms blocking TinyML inference, and how should inference be scheduled to avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0809", "title": "Audio Buffer Overruns from Unsynchronized DMA Fill Rates", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are audio predictions fluctuating when inference takes 40 ms but DMA fills the buffer every 32 ms, and what should be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0810", "title": "PID Loop Destabilization from RTOS Task Preemption Jitter", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone jerk when the TinyML gesture task runs, and how should task priorities protect the PID loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0811", "title": "Flash Capacity Exceeded by Aggressive Loop Unrolling Bloat", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does switching from -Os to -O3 make the Cortex-M4 binary fail to flash, and what optimization strategy should be used?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0812", "title": "UsageFaults from Unaligned 32-bit Memory Accesses", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does reading the 32-bit metadata field in the packed C struct trigger a HardFault on the Cortex-M4, and how can it be resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0814", "title": "Memory Allocation Planning for TVM vs TFLite Micro", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the TVM-compiled ResNet OOM while TFLite Micro fits, and what memory planning issue should be corrected?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0816", "title": "Wake-Word Duty Cycle Evaluation Under Power Constraints", "topic": 
"duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the maximum allowable sampling frequency to meet the power budget, and does the architecture satisfy the latency SLO?", "visual": {"kind": "svg", "path": "tinyml-0816.svg", "alt": "Log-scale timeline diagram showing active power spikes.", "caption": "Power profile of the wake-word sensor duty cycle."}, "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 4}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0818", "title": "Acoustic Monitor Power Budgeting", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the exact CPU cycle requirements and design a system frequency configuration to ensure the device remains under a 15mW active compute power budget.", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0819", "title": "BLE Transmission Overlap Diagnosis", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the latency failure in the current sequential pipeline and design a specification to overlap operations using hardware peripherals.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0828", "title": "Intermittent Computing FRAM Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the overhead of this checkpointing mechanism if the model has 10 layers, each producing a 10KB output tensor, and writing to FRAM takes 2 microseconds per byte.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0841", "title": "Implement depthwise separable conv for Cortex-M4 with CMSIS-NN", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the three-layer depthwise separable CNN be implemented with CMSIS-NN and buffered to fit STM32F4 SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0842", "title": "Implement MobileNet-style INT8 quantization for ESP32-S3 inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How 
should the MobileNetV1-0.25x model be quantized and deployed on ESP32-S3 for reliable INT8 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0843", "title": "Mastery: depthwise separable conv SRAM layout optimization for Cortex-M4", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the Cortex-M4 depthwise separable layer be tiled and costed to achieve correct SIMD latency and SRAM use?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 4}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0844", "title": "Realization: deploy EfficientNet-inspired tiny CNN on Cortex-M4 within 256KB SRAM", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the architecture to fit within 256KB SRAM and 256KB Flash for model weights, then realize the TFLite Micro deployment?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 3}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0845", "title": "Realization: adapt MobileNetV1-0.25x for ESP32-S3 with resolution scaling", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How must the MobileNetV1-0.25x input resolution and architectural stride pattern be adapted to satisfy the ESP32-S3 latency and SRAM constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0846", "title": "Specification: define CNN architecture constraints for Cortex-M4 vision inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture, memory, latency, and accuracy constraints should specify a CNN for 64x64 fault detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0847", "title": "Dataset Curation: Design Training Data Pipeline for MCU Keyword Spotting", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the training data collection and curation pipeline that produces balanced training data suitable for this ultra-constrained deployment context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0848", "title": "Dataset Curation: Fluency — MFCC 
Feature Storage Budget for MCU Training Data", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you estimate storage requirements for 10K training samples and determine whether they fit in the 8MB PSRAM for on-device training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0849", "title": "Dataset Curation: Implement Feature Extraction Pipeline for Cortex-M4 Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should MFCC feature extraction simulate Cortex-M4 fixed-point behavior to maintain training and inference parity?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 0}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0850", "title": "Dataset Curation: Implement Data Augmentation for TinyML Acoustic Models", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How large is the augmented KWS dataset after applying speed, SNR noise, and SpecAugment, and is it feasible to process?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0851", "title": "Dataset Curation: Mastery — End-to-End Data Strategy for TinyML Anomaly Detection", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete data strategy for an anomaly detection model that achieves < 5% false alarm rate and > 90% anomaly detection rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0852", "title": "Dataset Curation: Mastery — Cross-Device Dataset Portability for TinyML Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a single data curation pipeline that produces training data suitable for all three MCU model variants?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0853", "title": "Dataset Curation: Optimize Dataset Size vs Model Accuracy on Cortex-M4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the most cost-effective path from 89% to 95% keyword-spotting accuracy on Cortex-M4?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 1}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0854", "title": "Dataset Curation: Realize Dataset Storage Architecture for TinyML Training Pipeline", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify file formats, feature pre-computation, quantization pipeline, and validation set splits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0855", "title": "Dataset Curation: Realize PSRAM-Constrained Training Dataset for ESP32-S3", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify what fits in PSRAM, the training batch strategy, and training loop design?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 1}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0856", "title": "Dataset Curation: Recall — What is the Key Data Constraint for TinyML Training?", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What are the three key data constraints that differ from cloud ML training, and why does each matter for model accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0857", "title": "Dataset Curation: Specification — TinyML Dataset Quality SLA for Medical Wearable", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative dataset quality SLAs are needed for a medical fall detector to meet sensitivity and false alarm targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0858", "title": "Latency Decomposition: Compare Cortex-M4 vs. Cortex-M7+Ethos-U55 for Keyword Spotting", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency and explain which dominates the power budget for always-on detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0859", "title": "Latency Decomposition: Compare ESP32-S3 vs. 
Cortex-M4 for Image Classification", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency accounting for the PSRAM access penalty on the ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0860", "title": "Latency Decomposition: Compute Per-Layer Latency Budget for MCU CNN", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which layer bottlenecks a 2.1 MOP depthwise-separable CNN on a 168 MHz Cortex-M4 at 1 MAC/cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0861", "title": "Latency Decomposition: Full TinyML Pipeline Audit for Predictive Maintenance Sensor", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the predictive maintenance latency pipeline be audited, and which components leave room under the 50 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0862", "title": "Latency Decomposition: Full TinyML Gesture Recognition Pipeline with Power Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the ESP32-S3 gesture wristband meet its one-week battery target, and what power changes are needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0863", "title": "Latency Decomposition: Optimize MCU Inference Latency from 200ms to Under 50ms", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimizations can move the 168 MHz Cortex-M4 speech model from 200 ms toward the 50 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0864", "title": "Latency Decomposition: Size and Validate TinyML Model for Real-Time ECG", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Do the ECG model's 50 KB weights, activations, and buffers fit in 256 KB SRAM on a 168 MHz MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0865", "title": "Latency Decomposition: Size End-to-End Latency for Environmental Sensor TinyML Node", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "How should end-to-end latency be sized for the ESP32-S3 air quality pipeline, including sensor reads and LoRa transmission?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0866", "title": "Model Format Conversion: Compare TFLite Micro vs. TensorFlow Lite for MCU Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare TFLite Micro (MCU runtime, <20KB RAM overhead) vs. standard TFLite (GPU delegate) for each platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0867", "title": "Model Format Conversion: Compare TFLM vs. CMSIS-NN for Cortex-M7 Inference", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 500K-parameter INT8 CNN use TFLM with Ethos-U55 delegate or manual CMSIS-NN and Ethos offload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0868", "title": "Model Format Conversion: Recall TFLite Micro Supported Op List and Limitations", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which common neural network operations are NOT supported in TFLite Micro, and what are the workarounds?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 0}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0869", "title": "Model Format Conversion: Implement TFLite Flatbuffer Size Calculation for MCU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total model size and determine if it fits in the 512KB Flash alongside a 64KB application binary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0870", "title": "Model Format Conversion: Full TFLM Deployment Mastery for Embedded Vision on M7", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the full conversion pipeline's memory layout meet a 50ms latency budget on a battery-powered device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0871", "title": "Model Format Conversion: Full MCU Model 
Format Pipeline from Research to Flash", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 50K-parameter LSTM for ESP32-S3 stay in TFLM, use a custom engine, or be replaced by a 1D CNN?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0872", "title": "Model Format Conversion: Optimize MCU Model Conversion for Minimal Flash Footprint", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and quantify optimizations to free 100KB Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0873", "title": "Model Format Conversion: Size TFLM Model for Cortex-M4 Flash and SRAM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you size Flash and SRAM for MobileNetV1-0.25 INT8 on a 512 KB Flash, 256 KB SRAM Cortex-M4?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 1}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0874", "title": "Model Format Conversion: Size TFLM Deployment for ESP32-S3 with PSRAM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency penalty comes from placing 475 KB MobileNetV1 weights in ESP32-S3 PSRAM instead of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0875", "title": "Model Format Conversion: Optimize INT4 Quantization for Cortex-M4 MCU Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate INT4 quantization for the first model and quantify Flash savings vs. 
accuracy risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0876", "title": "Model Format Conversion: Optimize TFLM Model to Fit Within MCU Flash Constraint", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you quantify the minimum combination of optimizations needed without exceeding 2% accuracy degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0877", "title": "Model Format Conversion: Size Multi-Model TFLM Deployment on Cortex-M7+NPU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should Flash, SRAM, and activation arenas be sized for two triggered TFLM models on Cortex-M7 plus Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0878", "title": "Model Format Conversion: Recall TFLite INT8 Quantization Spec for MCU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the quantization parameters stored per tensor, and why is per-channel required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0879", "title": "Model Size Estimation: Analyze Why MCU Cannot Run INT8 Models Larger Than 200KB", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is 512 KB Flash not the real limit for INT8 model size on Cortex-M4, and what SRAM-based limit applies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0880", "title": "Model Size Estimation: Design Minimum-Memory Architecture for MCU Keyword Spotter", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive layer types, filter counts, activation memory profile per layer, and verify the SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0881", "title": "Model Size Estimation: Compare DS-CNN-S vs. 
MobileNetV1-0.25 on ESP32-S3", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which fits ESP32-S3 audio KWS better: DS-CNN-S at 0.9 MOPs or MobileNetV1-0.25 at 14.9 MOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0882", "title": "Model Size Estimation: Compare Quantized vs. Float Model on Cortex-M7+NPU", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare Flash usage, SRAM peak activation memory, inference latency, and power for a 10Hz detection rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0883", "title": "Model Size Estimation: Fluency — Size MCU Model Memory in Under 30 Seconds", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 50K-parameter INT8 model fit in 512 KB Flash and 256 KB SRAM on a Cortex-M4?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0884", "title": "Model Size Estimation: Master Full MCU Memory Budget for Industrial Sensor Node", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you verify the complete system fits and compute the inference duty cycle for 10Hz operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0885", "title": "Model Size Estimation: Master Memory-Constrained Model Selection for Medical Wearable", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model and platform choice can meet the medical arrhythmia accuracy and 5-year battery requirements, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0886", "title": "Model Size Estimation: Diagnose and Fix MCU SRAM Overflow at Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose a Cortex-M4 TFLM HardFault when a 180 KB arena fails at the third convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0887", "title": "Model Size Estimation: Diagnose Flash Overflow on ESP32-S3 Multi-Model Deployment",
"topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 two-model firmware report a 2.1 MB overflow when its apparent Flash total is only 5.8 MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0888", "title": "Model Size Estimation: Realize Parameter Count and Memory for MCU Anomaly Detector", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many bias bytes are required for the Conv1D anomaly detector, and why is that count different from the incorrect estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0889", "title": "Model Size Estimation: Realize Full System Memory for ESP32-S3 Voice Assistant", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should SRAM, PSRAM, and Flash be allocated for the ESP32-S3 voice assistant with two models and audio buffering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0890", "title": "Model Size Estimation: Specify Memory-Optimal Architecture for Battery-Powered MCU", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive maximum model parameters, inference latency, duty cycle, and average power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0891", "title": "Model Format Conversion: TFLM Fluency — Compute Activation Arena Size in 30 Seconds", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the minimum TFLM activation arena size needed for this 3-layer INT8 CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0892", "title": "Model Format Conversion: Specify TFLM Deployment Checklist for Safety-Critical MCU Application", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a production deployment checklist for a TFLite Micro model on Cortex-M7+Ethos-U55 (480MHz, 512KB SRAM) used in a safety-critical medical device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0893", "title": "TinyML TCO Recall: Cortex-M4 vs ESP32-S3 Cost Profile", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": 
"remember", "status": "published", "phase": "inference", "question": "For a 10,000-unit deployment, which platform has lower 2-year TCO?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0896", "title": "TinyML TCO Design: Optimized Power Budget for Wearable TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can a 100 mAh LiPo meet a 7-day wearable budget with a continuous 2 mA heart-rate sensor and 30-minute BLE syncs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0897", "title": "TinyML TCO Design: Cortex-M7+Ethos-U55 for Industrial TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you calculate the 5-year Total Cost of Ownership (TCO) combining hardware and power costs to determine the most cost-effective architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0898", "title": "TinyML TCO Design: Fleet TCO for Agricultural Sensor Network", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the 5-year TCO for 10,000 ESP32-S3 (240MHz, 512KB SRAM, WiFi) crop monitoring sensors?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0901", "title": "TinyML TCO Diagnosis: Battery Life Mismatch Root Cause", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause and calculate the actual vs expected average current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0902", "title": "TinyML TCO Evaluation: Cortex-M4 vs ESP32-S3 for Production Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the 2-year battery replacement TCO for 500 units?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-0904", "title": "TinyML TCO Evaluation: Cloud-in-the-Loop vs Fully On-Device TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should the predictive maintenance fleet use on-device TinyML versus cloud inference, and what is the three-year TCO?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0905", "title": "TinyML TCO Fluency: Quick Cost Estimation for TinyML Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the quick estimates for sensor energy cost, ESP32-S3 cloud break-even volume, and coin-cell battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0906", "title": "TinyML TCO Fluency: Rapid BOM and Power Budget for TinyML Product", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected battery life of this system at 3V, and does it meet a 10-year requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0908", "title": "TinyML TCO Implement: TinyML Inference Energy per Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the energy per inference (in μJ) and the cost for 1 million inferences on both platforms, and which factor is the true differentiator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0910", "title": "TinyML TCO Implement: ESP32-S3 vs Cloud for TinyML Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do hardware, power, cloud inference, data transmission, and SIM costs compare for ESP32-S3 versus cloud inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0911", "title": "TinyML TCO Mastery: Full Lifecycle TinyML Product Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the five-year lifecycle costs and margin for the ESP32-S3 air quality monitor product?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0912", "title": "TinyML TCO Mastery: Make 
On-Device vs Cloud Decision for Industrial TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which make, buy, or cloud option minimizes five-year TCO for 50,000 vibration sensors, and why?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0914", "title": "TinyML TCO Optimization: Quantization Savings for TinyML Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the battery life improvement, fleet-wide annual power savings, and is the $20K quantization engineering cost justified over 3 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0915", "title": "TinyML TCO Optimization: Reduce TinyML Fleet Management Costs", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What automation opportunities exist for each category, and what are the quantified savings and 3-year NPV at a 5% discount rate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0919", "title": "TinyML TCO Realization: Total Platform Cost for TinyML Development", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total first-year platform development cost for the ESP32-S3 TinyML product, including tools, cloud, labels, and engineering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0920", "title": "TinyML TCO Specification: Design $100K Budget TinyML Infrastructure", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you verify against the $100K budget constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0922", "title": "TinyML TCO Specification: Design TinyML Fleet Budget for Healthcare", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What five-year budget should a hospital plan for 200 regulated TinyML vital sign monitors, and what dominates the cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0923", "title": "Analyze Transformer FLOPs on Cortex-M7+Ethos-U55", "topic": "transformer-systems-cost", "competency_area": "compute", "track": 
"tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much MobileViT computation falls back to the Cortex-M7 versus Ethos-U55, and how does that affect latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0924", "title": "Analyze Chinchilla Scaling for MCU-Deployable Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the tension between Chinchilla-optimal training compute and the memory constraints forcing architectural compromises at inference?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0925", "title": "Design Speculative Decoding for Cortex-M4 Transformer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the speculation window size and fallback strategy to maximize tokens/second?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0926", "title": "Design Attention Approximation for 256KB SRAM Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What attention approximation fits a 128-token transformer context into the ESP32-S3 KV-cache budget while preserving accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0927", "title": "Design Layer-Wise Quantization Schedule for Transformer on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a mixed-precision quantization schedule that preserves accuracy while fitting memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0928", "title": "Diagnose Transformer Inference Latency Regression on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the ESP32-S3 transformer jump from 85 ms to 340 ms when context increased from 32 to 128 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0929", 
"title": "Diagnose Attention Head Collapse in Quantized MCU Transformer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why INT4 caused head collapse and propose a targeted fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0930", "title": "Diagnose Memory Fragmentation During Transformer Layer Execution", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 transformer with 160 KB free heap fail to allocate a 128 KB QK^T buffer, and how can it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0931", "title": "Evaluate Transformer vs CNN for Wake-Word Detection on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 500K-parameter transformer worth 1.4% KWS accuracy over a DS-CNN given 15 versus 69 days of battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0932", "title": "Evaluate KV Cache Compression Techniques on Ethos-U55", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which KV-cache compression gives the best accuracy-memory Pareto point for the 4-layer transformer on Cortex-M7+Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0933", "title": "Evaluate Pruning Strategies for Transformer Attention on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 pruning mix gets a 140 ms transformer under 100 ms with the least accuracy loss?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0934", "title": "Flash Attention FLOPs Formula for Tiny Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What total FLOPs should be used for the tiny transformer forward pass, and what calculation pitfalls must be avoided?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0935", "title": "Recall Arithmetic Intensity Threshold for Attention on MCU", "topic": "transformer-systems-cost", 
"competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity threshold, and is standard dot-product attention compute-bound or memory-bound at sequence length 64 with 64-dim heads?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0936", "title": "Recall Memory-Bandwidth-Bound Decode on Embedded Hardware", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What arithmetic-intensity threshold makes a 500K-parameter ESP32-S3 transformer decode compute-bound with PSRAM latency?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0937", "title": "Implement Tiled Matrix Multiply for Attention on Cortex-M7", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you determine optimal tile size and compute expected speedup over naive implementation?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0938", "title": "Achieve Mastery in Transformer Inference Optimization on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What optimization roadmap gets the Cortex-M7+Ethos-U55 transformer from 280 ms and 8.2 mW to the targets?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0939", "title": "Master Flash-Aware Transformer Scheduling on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How feasible is SPI layer-swapping for a 12 MB transformer on ESP32-S3, and what prefetch and compression schedule can meet 1 s latency?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0940", "title": "Master Transformer FLOPs-to-Power Model for Battery Sizing", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compute the active energy per inference, sleep energy between inferences, total annual energy, and required battery capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0941", "title": "Optimize Prefill Batching on Ethos-U55 for Batch Documents", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What batch size should the Ethos-U55 use for eight simultaneous 64-token prefill jobs under the 256 KB SRAM budget to maximize throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0942", "title": "Optimize Weight Sharing Across Transformer Layers on Flash-Limited MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the computational cost increase, memory impact, and expected accuracy trade-off?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0943", "title": "Recall Transformer Token Budget on TinyML Hardware", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum transformer sequence length fits in a 128 KB KV cache on Cortex-M4 with the given layers, heads, and head size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0944", "title": "Specify Transformer Architecture for Sub-1mW Inference on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the number of layers, heads, embedding dimension, FFN ratio, and quantization precision to fit the 0.5mW power budget with >90% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0945", "title": "Specify KV Cache Layout for Ethos-U55 DMA Access Patterns", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the complete KV cache memory map including layer offsets, head offsets, and alignment padding?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0946", "title": "Realize Quantization-Aware Training Pipeline for MCU Transformer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an INT8 QAT pipeline be staged for the Cortex-M7 transformer to recover F1 within the 24 GPU-hour budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0947", "title": "Realize Streaming Transformer Inference with Circular KV Buffer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a circular KV buffer update 20 ms speech chunks on ESP32-S3 while maintaining a 128-token sliding window?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0948", "title": "Analyze TCO of Custom ASIC vs MCU for TinyML at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze 5-year TCO for both options including warranty costs from missed anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0951", "title": "Cortex-M4 vs NPU Latency Analysis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical compute-bound latency for both architectures that explains why the slower-clocked NPU outperforms the CPU?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": {"tinyml-chain-auto-secondary-012-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0952", "title": "Adversarial Denial of Sleep Attack Analysis", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the systemic impact of this adversarial perturbation on the device's power budget and operational lifespan?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0953", "title": "Depthwise Convolution Latency Discrepancy", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the latency improvement not match the MAC reduction, and why did the SRAM footprint grow on this architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0954", "title": "NPU Utilization and Cycle Cost Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the actual cycle cost for both layers and explain why the depthwise layer's compute efficiency drops so drastically on this shared-memory architecture?", "chain_ids": ["tinyml-chain-auto-secondary-004-02"], "chain_positions": {"tinyml-chain-auto-secondary-004-02": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0955", "title": "Active Learning SRAM Overflow", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does curating a dataset of low-contrast anomalies cause these specific hardware failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0956", "title": "DMA Overhead on Small Audio Chunks", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does DMA raise power for 32-byte I2S transfers on a 128 MHz nRF5340 instead of saving it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0957", "title": "Battery Drain Anomaly in Dual-Core Always-On Audio", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the dual-core always-on audio system drain the 200mAh coin cell in 4 days despite the application core's low 10% duty cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0958", "title": "Energy Impact of PSRAM Weight Offloading", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving 400 KB of weights from SRAM to PSRAM greatly increase the ESP32-S3 energy per inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0959", "title": "4-Bit Quantization Latency Regression on nRF5340", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 4-bit weight quantization on the nRF5340 shrink flash usage but 
increase inference latency and battery drain?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0961", "title": "Analyze Fallback Latency", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Explain why this latency overrun happens on the MCU when falling back from a 1M-MAC to a 4M-MAC CNN.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0962", "title": "Operator Fusion Memory Tradeoff on nRF5340", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does fusing operators to reduce latency cause a significant increase in peak SRAM usage?", "chain_ids": ["tinyml-chain-auto-secondary-004-32"], "chain_positions": {"tinyml-chain-auto-secondary-004-32": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0963", "title": "The Distillation Temperature Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does this specific layer cause such a severe performance degradation, and what is the optimal deployment strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0964", "title": "Latency Decomposition of SPI Camera Pipeline on Corstone-300", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is Corstone-300 vision latency about 40 ms when Ethos-U55 inference is only 0.04 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0965", "title": "NPU to CPU Fallback Latency Analysis", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does replacing ReLU with a zero-MAC custom activation increase total layer execution time by roughly 60x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0966", "title": "ESP32-S3 PSRAM Bandwidth Bottleneck Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do ESP32-S3 INT8 weights in external PSRAM dominate latency despite 240 MHz vector compute?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0967", "title": "ESP32-S3 Quantized Model Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 INT8 TFLM model run slowly when ESP-NN vector kernels are not linked?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 1}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0968", "title": "Analyzing SRAM Overflow in Cortex-M4 Residual Block", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this memory failure occur despite the largest individual activation size being well under the 256 KB SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0969", "title": "NAS Memory Constraint Analysis on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the NAS reject the nRF5340 inverted bottleneck block despite its weights fitting in Flash?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0970", "title": "Layer Fusion Latency Penalty on Cortex-M4", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this layer fusion scheduling behavior degrade execution speed on this specific hardware architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0971", "title": "OTA Rollback due to Shared SRAM Exhaustion", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Corstone-300 OTA rollback after boot when a 320 KB tensor arena and 200 KB network stack share 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0972", "title": "NPU Offloading Energy Overhead Paradox", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": 
"inference", "question": "Why does this system behavior occur, and what is the total energy-per-inference for both CPU-only and NPU-offloaded execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0973", "title": "ESP32-S3 PSRAM Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why the execution takes 10 ms despite the high-speed CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0974", "title": "Unstructured vs Structured Pruning on Ethos-U55", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured sparsity provide no speedup on the Ethos-U55, and what is the compute cycle difference with 50% structured channel pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0975", "title": "Asymmetric Weight Quantization Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you explain mathematically why changing the weights to asymmetric quantization causes such a drastic latency spike on this specific hardware?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0976", "title": "Analyzing Preemption and Real-Time Misses on STM32F4", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does theoretical execution time differ from the observed 12ms, and what causes the deadline miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0977", "title": "Latency Analysis of On-Device Privacy Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the guardrail cause this specific latency bottleneck, and what is its minimum execution time floor due to memory fetches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0978", "title": "ESP32-S3 PSRAM Roofline Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the roofline model, why are the vector extensions underutilized and what is the layer's operational intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0979", "title": "Watchdog Resets in 
Dual-Core Shared SRAM Inference", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why the device experiences intermittent watchdog resets during these network events?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0980", "title": "DMA Buffer Overrun with BLE on ESP32-S3", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this happen based on the system's temporal constraints, and what is the maximum time the BLE stack can block the core before causing an overflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0981", "title": "Operator Scheduling for Peak SRAM Reduction", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does changing the execution order of these parallel branches alter the peak static memory requirement in the flat tensor arena?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 2}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0982", "title": "ESP32-S3 vs External NPU for Edge AI", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the system architecture, compare the compute tradeoffs, and justify your choice between native ESP32-S3 execution versus an external ASIC/NPU?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], "chain_positions": {"tinyml-chain-auto-secondary-012-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0983", "title": "Secure TinyML Keyword Spotting Architecture", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a defense strategy that masks execution patterns without violating a strict 50 ms end-to-end inference latency budget?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0984", "title": "Architecting Energy-Constrained Audio Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can the 
MCU run the 1.2 MMAC acoustic model for one-year battery life, and what duty-cycle architecture is needed?", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0985", "title": "Active Learning Data Pipeline for nRF5340 Anomaly Detection", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should nRF5340 vibration sensors curate active-learning data without continuously streaming raw signals?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 2}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0986", "title": "Energy-Aware Model Architecture for Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 vision wake-word model minimize SRAM access energy while using the Ethos-U55 efficiently?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0987", "title": "Architecting Sub-8-bit Weight Quantization for Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect an inference system utilizing extreme weight quantization to fit the model on-device while maintaining real-time execution speeds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0988", "title": "ESP32-S3 Acoustic Degradation Ladder", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a graceful degradation strategy that maintains fail-operational safety monitoring while adapting to constrained resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0989", "title": "AOT Compilation Pipeline Design for Cortex-M4 Edge Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What AOT compilation passes are required to turn the Cortex-M4 keyword model into a static CMSIS-NN executable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-0990", "title": "Designing a Distillation Pipeline for ESP32-S3 Audio Classification", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the tradeoffs between logit matching and feature distillation while maximizing the use of the ESP32-S3's INT8 vector extensions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0991", "title": "Dual-Core Memory Allocation for Keyword Spotting", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs of different memory allocation strategies considering the shared SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0992", "title": "Ethos-U55 Fallback Delegation and SRAM Strategy", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the model conversion and runtime delegation strategy to handle this fallback without exceeding the 512 KB SRAM limit or causing severe latency bottlenecks?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 1}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0993", "title": "Architecting a KWS Memory Pipeline for ESP32-S3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the ESP32-S3's internal SRAM and external PSRAM be partitioned to meet real-time latency constraints with WiFi enabled?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0994", "title": "Hardware-Aware NAS Design for ARM Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How must the NAS constraint estimators evaluate peak memory and latency without physically deploying every candidate architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0995", "title": "Dual-Core Power Partitioning on nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should DSP, inference, and BLE work be partitioned across nRF5340 cores to meet a 1 
mW audio anomaly budget?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0996", "title": "Profiling CPU-NPU Memory Contention on Corstone-300", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a non-intrusive trace and profiling strategy to definitively isolate whether the bottleneck is compute-bound on the NPU or memory-bound due to SRAM contention?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0997", "title": "Pruning Pipeline Design for nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What pruning strategy should compress the CNN while reducing both SRAM and latency, and how does it align with the MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0998", "title": "ESP32-S3 Quantization Strategy Design", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 quantization strategy fits the predictive maintenance model in fast SRAM without sacrificing INT8 acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0999", "title": "On-Device Bias Guardrails for Edge Audio", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device OOD guardrail for respiratory audio be partitioned between Cortex-M7 and Ethos-U55 within 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1000", "title": "Real-Time Vibration Ingestion Pipeline on Corstone-300", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the streaming ingestion pipeline, detailing your DMA buffering strategy, feature computation overlap, and memory partitioning to guarantee no dropped frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1001", "title": "NPU Shared SRAM Bus Contention Side-Channel", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this vulnerability, and how does the attack mechanism extract the model architecture?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1002", "title": "ESP32-S3 PSRAM Bottleneck Diagnosis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this massive compute estimation discrepancy and how would you prove it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1003", "title": "Diagnosing Domain Shift in ESP32-S3 Wake-Word Datasets", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of the performance drop and explain how you would curate the dataset to resolve it?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 2}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1004", "title": "NPU Inference Latency and CPU Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does copying a 256x256 camera frame on the 480 MHz M7 add about 0.7 ms before Ethos-U55 inference?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1005", "title": "Diagnosing Power Regressions in Sparse Networks", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can an 80% sparser Cortex-M4 network triple active energy per inference despite utilizing CMSIS-NN SIMD instructions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1006", "title": "4-bit Quantization Fallback Diagnosis on Ethos-U55", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4-bit Ethos-U55 keyword model peg the 480 MHz M7 CPU while the NPU sits idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1007", 
"title": "Distilled Student SRAM Exhaustion", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a smaller distilled nRF5340 model OOM while a larger pruned model runs successfully?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1008", "title": "Diagnosing Flash Wait State Stalls in CMSIS-NN", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 168 MHz STM32F4 stall when CMSIS-NN streams 800 KB of INT8 weights from flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1009", "title": "Diagnosing Latency Spikes from Unoptimized Fallbacks", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an INT8 depthwise convolution on STM32F4 take 890 ms instead of 30 ms, and how should it be fixed?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 3}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1010", "title": "NAS Latency Regression on Ethos-U55 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an Ethos-U55 NAS candidate with fewer FLOPs run 3x slower on a 512 KB Corstone-300 system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1011", "title": "Diagnosing SRAM Overflow During Concurrent Bluetooth Operations", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the nRF5340 audio model OOM during residual blocks only while BLE 5.3 is transmitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1012", "title": "Diagnosing Battery Drain in ESP32-S3 Wake-Word Engine", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware or software interactions are likely causing this massive power budget violation, and how would you diagnose the root cause?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1013", "title": "Diagnosing Asymmetric Quantization Overhead", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do asymmetric INT8 weights make the nRF5340 wake-word model fall from 20 ms to about 600 ms?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1014", "title": "Floating-Point Emulation Overhead in Safety Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Cortex-M4 OOD guardrail push latency past 60 ms, and how should the distance computation be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1015", "title": "HardFault During High-Frequency Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Cortex-M4 with 256 KB SRAM HardFault after buffering 1 s of 50 kHz vibration data and an 80 KB model arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1016", "title": "Diagnosing Peak SRAM OOM on Nordic nRF5340", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Nordic nRF5340 HardFault on one convolution despite 400 KB weights fitting in flash and tensors summing under 256 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1017", "title": "Evaluating Adversarial Defenses on ESP32-S3 Smart Locks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate these alternatives and determine the best approach given the hardware constraints?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1018", "title": "Evaluating Model Architectures for Cortex-M4", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which STM32F4 model is viable at 10 Hz 
with 50% CPU reserved: 15M FP32 MACs or 12M INT8 MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1019", "title": "Active Learning Curation for INT8 Constrained Edge Devices", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which active learning curation pipeline do you choose and how do you implement the selection efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1020", "title": "Audio Pipeline DMA Tradeoffs on ESP32-S3", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory pipeline design maximizes inference bandwidth and CPU sleep time on the ESP32-S3?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1021", "title": "Energy-Aware Model Selection on nRF5340", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model (SRAM-bound Model A vs Flash-bound Model B) minimizes total energy per inference on the nRF5340, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1022", "title": "NPU Thermal Fallback Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which fallback design should run when the Ethos-U55 is thermally power-gated, and does it meet the 5 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1023", "title": "Evaluating Distillation Strategies for Ethos-U55", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which distillation proposal best fits a 512 KB Cortex-M7+Ethos-U55 KWS deployment and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1024", "title": "Evaluating Pipeline Latency on Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which STM32F4 pipeline is lower latency: a soft-float 1024-point FFT with 50k MACs or integer preprocessing with 200k MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1025", "title": "TFLite Micro vs AOT Compilation on nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do TFLite Micro and AOT compilation compare regarding SRAM footprint, operator support, and runtime overhead on the nRF5340?", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 1}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1026", "title": "Evaluating Memory Constraints on Shared SRAM Architectures", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Corstone-300 model is feasible: 120K params with 260 KB activations or 190K params with 110 KB activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1027", "title": "Hardware-Aware NAS Tradeoffs on ESP32-S3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which architecture do you choose for deployment, and how do you justify the tradeoff between PSRAM latency and vectorization speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1028", "title": "Ethos-U55 Depth-First Scheduling for Memory Reuse", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which scheduling strategy is optimal given the shared SRAM constraints, and why?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1029", "title": "Evaluating Profiling Strategies for nRF5340 Audio Inference Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which profiling approach is better for isolating compute vs. 
I/O bottlenecks given the 256 KB SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1030", "title": "Structured vs Unstructured Pruning on nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which pruning strategy is better suited to fit the model onto the nRF5340 while minimizing execution time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1031", "title": "On-Device Anti-Spoofing Guardrail Evaluation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an nRF5340 smart lock run a 150 KB anti-spoofing model locally or send 16 KB audio to a phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1032", "title": "Audio Streaming DMA vs Interrupts on nRF5340", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an audio detector use 16 ms ADC interrupts or EasyDMA into a 32 KB ping-pong buffer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1033", "title": "Evaluate SRAM vs Flash Tensor Arena Placement", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory planning strategy yields better system performance, and what are the architectural tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1034", "title": "ESP32-S3 Vector Extension Efficiency", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are ESP32-S3 vector and scalar MAC throughputs for 20 MMAC at 10 FPS, and how much CPU utilization does each require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1035", "title": "Randomized Smoothing Latency on Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long does 20-pass randomized smoothing take on Cortex-M4, including overhead, and does it meet the 500 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1036", "title": "Inference Latency and Energy on Nordic nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": 
"L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency and energy per inference result from a 2.5 MOP keyword model on nRF5340 at 128 MHz and 5 mA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1037", "title": "Active Learning Storage Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 1-second uncertain audio clips can be stored on-device before needing to offload data?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 0}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1038", "title": "DMA Cycle Stealing Overhead", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What DMA bandwidth and cycle-stealing overhead does a 96x96 camera stream impose on inference when running at 168 MHz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1039", "title": "Energy Cost of Memory vs Compute on Ethos-U55", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy do Ethos-U55 dense-layer MACs and SRAM weight fetches consume, and which dominates?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1040", "title": "ESP32-S3 Low-Battery Model Fallback", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the ESP32-S3 inference latencies for the primary and low-battery fallback doorbell models?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1041", "title": "BatchNorm Constant Folding Flash Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the exact amount of Flash memory saved by this optimization, and what fraction of the nRF5340's total 1 MB Flash does this represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1042", "title": "Distilled INT8 Model Latency on ESP32-S3", 
"topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Can an ESP32-S3 INT8 student model with 4.5M MACs meet a 15 ms audio-frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1043", "title": "nRF5340 End-to-End Wake Word Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total end-to-end latency from the moment the audio frame is ready for processing to the completion of the BLE transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1044", "title": "SRAM Capacity Limits on Corstone-300", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the Corstone-300 convolution fit in 512 KB SRAM without tiling, and how much memory is missing?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 0}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1045", "title": "Ethos-U55 CPU Fallback Latency Penalty", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What execution-time penalty occurs when a 4.8M-MAC Ethos-U55 layer falls back to the Cortex-M7, assuming standard CMSIS-NN throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1046", "title": "ESP32-S3 INT8 Keyword Spotting Memory Footprint", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the INT8 keyword-spotting CNN fit entirely in ESP32-S3 internal SRAM, or must it use the slower PSRAM?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1047", "title": "Calculate Max MACs for NAS Search Space Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What absolute maximum number of INT8 MACs should a Cortex-M4 NAS enforce for a 15 ms budget at 60% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1048", "title": "Layer Fusion for SRAM Peak Memory Reduction", 
"topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the peak memory footprint for standard sequential scheduling versus fused operator scheduling, and determine whether they fit entirely within the fast SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1049", "title": "CPU vs External NPU Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the external I2C NPU slower than local CMSIS-NN on the Cortex-M4 for this convolution layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1050", "title": "Denial-of-Sleep via Early Exit Exploitation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a denial-of-sleep attack on early exits change average current and coin-cell battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1051", "title": "ESP32-S3 Depthwise Memory Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this hardware-specific bottleneck occur despite the massive reduction in mathematical operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1052", "title": "Non-linear Latency Scaling on Corstone-300", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a mere 20% increase in compute requirements result in a 7.5x increase in total inference latency on the Corstone-300?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1053", "title": "Calibration Dataset Outlier Bias", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does this specific calibration dataset composition cause the integer-only NPU to fail on subtle inputs?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 1}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1054", "title": "DMA Buffering vs CPU Interrupts on nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"Why does the CPU-driven approach cause overruns, and what is the CPU context-switch time saved per second by switching to DMA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1055", "title": "Coin Cell Capacity Degradation Under Inference Load", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 5mA pulse load cause the device to prematurely brownout and fail at less than 50% of the coin cell's rated capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1056", "title": "Memory vs Compute Energy in ESP32-S3 Workloads", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 100k-MAC fully connected layer in PSRAM use more energy than a 500k-MAC convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1057", "title": "Ternary Unpacking Overhead on Cortex-M33", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ternary packing shrink an nRF5340 KWS model from 100 KB to 25 KB but add 3.1 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1058", "title": "Fallback Model Latency Scaling Analysis", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 8x smaller fallback CNN reduce MCU inference latency by only about half?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1059", "title": "AOT Compiler Memory-Latency Tradeoff Analysis", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the architectural tradeoffs the compiler made to achieve this memory reduction, and calculate the energy difference per inference?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1060", "title": "Wake-Word Pipeline Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which stages dominate the Corstone-300 wake-word pipeline latency, and why is the Ethos-U55 not the bottleneck?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1061", "title": "NPU Fallback Bottleneck on Ethos-U55", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this massive latency spike occur, and what are the underlying compute and memory constraints of this heterogeneous setup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1062", "title": "ESP32-S3 Inference Latency Anomaly Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving 350 KB of ESP32-S3 INT8 weights from SRAM to PSRAM raise inference latency from 15 ms to 75 ms?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 1}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1063", "title": "Analyzing SRAM OOM in Early Convolution Layers", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can the first 64x64x32 INT8 convolution OOM an STM32F4 despite weights staying in flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1064", "title": "Hardware-Aware NAS SRAM Bottleneck on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the hardware-aware NAS reject high-accuracy candidate architectures despite their small 150 KB parameter size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1065", "title": "SRAM Optimization via Fused Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the default sequential schedule fail, and how does the fused execution order resolve the SRAM bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1066", "title": "SRAM Contention During A/B OTA Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Corstone-300 OTA update OOM when a 360 KB NPU model, 100 KB RTOS stack, and flash buffer share 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1067", "title": "Energy Analysis of DVFS on Ethos-U55", "topic": 
"power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this non-linear energy savings occur, and what is the dynamic energy per inference for both P-states?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1068", "title": "PSRAM Bandwidth Bottleneck on ESP32-S3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 spend 25 ms on a 1 MB INT8 model stored in 80 MHz Quad SPI PSRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1069", "title": "Unstructured Pruning Latency Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured pruning cause a latency regression on the Ethos-U55 architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1070", "title": "Per-Channel Requantization Overhead Analysis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many extra cycles per channel explain the 4.57 us per-channel quantization overhead on STM32F4, given a 168 MHz clock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1071", "title": "Interrupt-Driven Missed Audio Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 600,000-cycle Bluetooth interrupt make a 168 MHz Cortex-M4 audio pipeline miss a 16 ms frame deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 2}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1072", "title": "Fairness Guardrail PSRAM Latency Bottleneck on ESP32", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the ESP32-S3 fairness guardrail violate the 50 ms SLA after moving weights to PSRAM?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1073", "title": "ESP32-S3 SRAM vs PSRAM Roofline Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": 
"analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does moving ESP32-S3 CNN weights from SRAM to PSRAM shift the roofline and reduce utilization?", "chain_ids": ["tinyml-chain-auto-secondary-013-24"], "chain_positions": {"tinyml-chain-auto-secondary-013-24": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1074", "title": "Shared SRAM Contention WDT Resets", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the system pass lab tests but experience WDT resets in the field during heavy BLE activity?", "chain_ids": ["tinyml-chain-auto-secondary-008-08"], "chain_positions": {"tinyml-chain-auto-secondary-008-08": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1075", "title": "PSRAM Latency in Real-Time Audio Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does placing the rolling audio DMA buffer in ESP32-S3 PSRAM cause dropped real-time frames?", "chain_ids": ["tinyml-chain-auto-secondary-013-28"], "chain_positions": {"tinyml-chain-auto-secondary-013-28": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1076", "title": "Tensor Arena Peak Overlap Analysis", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the static memory planner fail with an OOM error at the Conv2D layer, despite the largest single tensor fitting in SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1077", "title": "Evaluating ESP32-S3 Vector Extensions for Audio ML", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the tradeoffs between FP32 on the standard CPU versus INT8 on vector extensions for the ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1078", "title": "Side-Channel Defense Architecture for STM32F4 Audio Authentication", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the STM32F4 speaker verification model defend against timing and power side-channel extraction within memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1079", "title": "nRF5340 Dual-Core Compute Partitioning", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the compute pipeline, DSP, model inference, and BLE tasks be partitioned across the dual cores to minimize energy consumption while respecting the hardware limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1080", "title": "Active Learning Pipeline for nRF5340 Wearable", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should an nRF5340 wearable selectively buffer and transmit active-learning samples without draining the battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1081", "title": "Ping-Pong DMA Architecture for Continuous Audio Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the data movement strategy using DMA to ensure zero audio samples are dropped while the CPU computes the inference, and what are the required buffer sizes and memory layout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1082", "title": "Energy-Efficient Audio Pipeline Design on Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Corstone-300 partition CPU, NPU, and SRAM work to minimize energy for always-on keyword spotting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1083", "title": "Architecting Sub-4-bit Keyword Spotting", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate and execute a sub-4-bit extreme quantization strategy on a processor that only supports INT8 SIMD instructions?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1084", "title": "Architecting Graceful Degradation for ESP32-S3 Voice Commands", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a smart speaker stage SRAM and PSRAM models for WiFi loss and low-power fallback?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-19": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1085", "title": "AOT Compiler Memory Architecture for STM32F4", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an AOT compiler fuse operators and plan memory so a 300 KB activation model fits in STM32F4 SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1086", "title": "Architecting Knowledge Distillation for KWS on ESP32-S3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should an ESP32-S3 KWS student be distilled so weights and activations stay within the 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1087", "title": "Dual-Core Latency Pipeline for BLE Wake-Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should nRF5340 cores pipeline audio, MFCC, inference, and BLE to meet a 150 ms wake-word budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1088", "title": "Corstone-300 Memory Allocation for KWS", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 allocate the 512 KB of shared SRAM and external Flash to maximize Ethos-U55 utilization without hitting OOM errors?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 2}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1089", "title": "Corstone-300 NPU Delegation and Memory Architecture", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the conversion and delegation strategy to handle this CPU-NPU context switch while ensuring the system operates within the strict 512 KB SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1090", "title": "ESP32-S3 Audio Wake Word System Architecture", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should ESP32-S3 
manage SRAM, PSRAM, cores, and audio buffers for an always-on wake-word system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1091", "title": "Hardware-Aware NAS Design for Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a Cortex-M4 NAS pipeline enforce a 50 ms latency budget and 200 KB activation limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1092", "title": "Dual-Core Memory-Aware Operator Scheduling on ESP32-S3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the microcontroller schedule Conv2D and depthwise operators to avoid materializing a 400 KB intermediate tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1093", "title": "Sparse Wake-Word Design for Nordic nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity strategy that leverages the hardware's architecture to meet memory and power constraints without degrading accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1094", "title": "ESP32-S3 Audio Wake-Word Quantization Pipeline", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate and design the optimal quantization strategy to fit memory constraints while preserving accuracy?", "chain_ids": ["tinyml-chain-auto-secondary-004-16"], "chain_positions": {"tinyml-chain-auto-secondary-004-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1095", "title": "ESP32-S3 Real-Time Audio Anomaly Detection Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should ESP32-S3 isolate real-time audio inference from WiFi jitter to guarantee a 10 ms deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1096", "title": "On-Device Guardrails for Predictive Maintenance", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Corstone-300 safety guardrail be implemented as a secondary model on the Ethos-U55 NPU or as deterministic physical-bounds checks on the Cortex-M7, and why?", 
"chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1097", "title": "Architecting a Vision Pipeline on Corstone-300", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Corstone-300 partition M7 post-processing and U55 convolutions to improve roofline utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1098", "title": "Continuous Audio Ingestion Pipeline on Corstone-300", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 pipeline be architected to ingest 16 kHz audio without dropping frames while sharing 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1099", "title": "CNN Backbone Architecture for Corstone-300", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should CNN convolution blocks be architected to maximize Ethos-U55 utilization without causing OOM errors in the 512 KB shared SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1100", "title": "Solar-Powered Acoustic Event Duty Cycle Design", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What sleep and wake schedule lets a solar-powered STM32F4 acoustic detector monitor continuously within its energy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1101", "title": "ESP32-S3 Dual-Model Tensor Arena Architecture", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design the memory planning, operator scheduling, and arena placement between SRAM and PSRAM to meet real-time latency requirements without exceeding the memory budget?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 2}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1102", "title": "Shared SRAM Contention Side-Channel", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can an unprivileged M7 
telemetry task extract model structure from Ethos-U55 SRAM contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1103", "title": "ESP32-S3 PSRAM Bandwidth Compute Bottleneck", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware constraint is causing your compute estimation to be inaccurate, and how do you diagnose the root cause?", "chain_ids": ["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1104", "title": "ESP32-S3 Camera Sensor Domain Gap Diagnosis", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What dataset curation failure occurred, and how does the hardware architecture explain this symptom?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1105", "title": "NPU Garbage Predictions After CPU Preprocessing", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Ethos-U55 read random features after the M7 computes MFCCs in a 16 KB DMA buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1106", "title": "Diagnosing Power Drain from Memory Access", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a fully connected Cortex-M4 model use more energy than a convolutional model with the same 5M MACs?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1107", "title": "Ethos-U55 W4A8 Fallback Stalls", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a W4A8 CNN on Corstone-300 run on the 480 MHz M7 instead of the Ethos-U55, and what is the hardware-level root cause?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1108", "title": "Diagnosing WDT Resets During Peak BLE Transmissions", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do nRF5340 WDT resets correlate with BLE retries, and what degradation ladder should prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1109", "title": "Diagnosing BLE Drops from Dense Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a dense distilled model on nRF5340 drop BLE 5.3 connections and trigger watchdog resets during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1110", "title": "Diagnosing Fully Connected Layer Bottlenecks on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a CMSIS-NN fully connected layer on the STM32F4 run 4x slower than its theoretical INT8 SIMD performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1111", "title": "Diagnosing Operator Fallback in TFLite Micro Conversion", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this latency anomaly during the conversion and deployment pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1112", "title": "Diagnosing SRAM Exhaustion on nRF5340 with BLE", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which nRF5340 SRAM users were missing from the team's 180 KB tensor-arena estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1113", "title": "Diagnosing NAS SRAM Constraints on Corstone-300", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the Corstone-300 NAS winner HardFault despite a 380 KB activation estimate, and what memory overhead was missed?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1114", "title": "SRAM Exhaustion in Multi-Branch CNN Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the branched nRF5340 CNN HardFault even though the largest 
individual activation is only 140 KB?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1115", "title": "ESP32-S3 Brownout During Concurrent Vision Inference and WiFi TX", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ESP32-S3 vision inference plus WiFi TX trigger brownout on an 80% charged LiPo battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1116", "title": "Diagnosing CMSIS-NN SIMD Underutilization", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 500K-MAC 1x1 convolution take 15 ms on a 168 MHz STM32F4 despite CMSIS-NN SIMD?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1117", "title": "INT8 Per-Tensor Degradation in Depthwise Convolutions", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization fundamental causes massive activation deviations in depthwise convolution layers under per-tensor INT8 PTQ?", "chain_ids": ["tinyml-chain-auto-secondary-004-17"], "chain_positions": {"tinyml-chain-auto-secondary-004-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1118", "title": "Diagnosing Bias Amplification in Compressed Audio Models", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this demographic performance disparity and explain how hardware constraints contributed to the symptom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1119", "title": "Diagnosing Sensor Buffer Overrun During Inference", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4 kHz accelerometer DMA ping-pong buffer corrupt windows during a 35 ms Cortex-M4 inference?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1120", "title": "Nordic nRF5340 SRAM Crash During Convolution", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the model HardFault during the first optimized convolution despite a 140 KB tensor arena estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1121", "title": "NPU to CPU Porting Power Diagnosis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the nRF5340 port drain a coin cell faster than the old NPU design despite lower peak current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1122", "title": "Partitioning Workloads on Cortex-M7 and Ethos-U55", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Corstone-300 architecture is optimal given the shared SRAM constraints, and what are the system-level latency tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1123", "title": "Mitigating DPA Attacks on ESP32-S3 Wake-Word Models", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 DPA defense keeps a 15M-MAC wake-word model within real-time latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1124", "title": "Real-time Compute Estimation for Sensor Data", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which INT8 model meets the 20ms STM32F4 sensor stream budget when operating at 50% MAC utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1125", "title": "On-Device Active Learning Curation for STM32F4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which on-device data selection strategy is best given the STM32F4's hardware constraints?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 2}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1126", "title": "DMA Buffer Placement vs Activation Spilling", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", 
"level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate both designs and identify the optimal choice for overall system throughput.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1127", "title": "Energy Tradeoffs of LUT vs On-the-Fly Computation", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which approach is more energy-efficient per operation and how it impacts the overall system power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1128", "title": "Evaluating 4-bit vs INT8 on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate INT8 versus 4-bit weight-only quantization for latency and power, and which approach is superior for this specific architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1129", "title": "Degradation Strategy for Ethos-U55 Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What graceful degradation strategy should keep Corstone-300 anomaly detection fail-operational during NPU thermal shutdown?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1130", "title": "Evaluating Distillation vs Pruning for Ethos-U55 Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should a Corstone-300 wake-word model use 40% unstructured pruning or a dense 350 KB distilled student to fit 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1131", "title": "Evaluating End-to-End Latency Tradeoffs on Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which pipeline provides better end-to-end latency, and how do the preprocessing and inference components contribute to the total time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1132", "title": "XIP vs DMA Paging on 
nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the latency and power trade-offs between XIP and DMA double-buffering for weights on the nRF5340?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 1}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1133", "title": "Evaluating SRAM Constraints on Corstone-300", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model (A or B) should be selected, and how does the shared memory architecture influence this decision on the Corstone-300?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1134", "title": "Evaluating Hardware-Aware NAS for ESP32-S3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which model is the better architectural choice for a battery-powered device and what are the system-level tradeoffs?", "chain_ids": ["tinyml-chain-auto-secondary-011-21"], "chain_positions": {"tinyml-chain-auto-secondary-011-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1135", "title": "Corstone-300 Operator Cascading and CPU-NPU Pipelining", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should Corstone-300 use Ethos-U55 layer cascading or sequential SRAM writes for a Conv2D-Depthwise-Dense chain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1136", "title": "Evaluate Race-to-Sleep vs DVFS for Cortex-M4 KWS", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which power strategy uses less energy for Cortex-M4 keyword spotting, race-to-sleep or DVFS, and by how much?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1137", "title": "Evaluating Profiling Strategies for BLE Audio on nRF5340", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which profiling method is better without disrupting the BLE timing on the 64 MHz network core?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1138", 
"title": "Evaluating Pruning Strategies on nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option do you deploy to minimize active power draw (~5mA) and satisfy memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1139", "title": "Evaluating On-Device Privacy Guardrails for Audio ML", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which privacy guardrail approach fits the SRAM and power limits while enabling demographic false-positive audits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1140", "title": "nRF5340 Roofline Memory Hierarchy Tradeoffs", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate both models using a roofline analysis to determine which is better for minimizing active time and maintaining the ~5mA power constraint.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1141", "title": "Dual-Core BLE Streaming Architecture Evaluation", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which architecture is better for maximizing battery life (~5mA active current constraint) while ensuring no dropped packets during high-throughput BLE streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1142", "title": "Ethos-U55 vs CPU Tensor Placement Tradeoffs", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory planning approach do you choose and how does it impact your SRAM peak usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1143", "title": "PTQ vs QAT fallback on Ethos-U55", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should Corstone-300 keep sensitive layers FP32 on the M7 or use full INT8 QAT for the Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1144", "title": "Evaluating CNN Data Layouts for Cortex-M4 Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model 
do you choose to guarantee the 5ms real-time deadline, and how does the architectural tradeoff affect worst-case execution time (WCET)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1145", "title": "Estimating Energy Per Inference on Nordic nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does one nRF5340 inference consume when 12.8 million cycles run at 5 mA and 3 V?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1146", "title": "Corstone-300 NPU Latency Profiling Math", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical Ethos-U55 latency for the 1.2M-MAC wake-word model once CPU fallback is eliminated?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1147", "title": "Guardrail Latency Budget on Ethos-U55", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum theoretical MAC complexity the guardrail model can support to mathematically guarantee it meets the latency deadline?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1148", "title": "Roofline Ridge Point Calculation on Ethos-U55", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the Ethos-U55 peak GOPS, SRAM roofline ridge point, and bottleneck for a 16 OP/byte layer?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1149", "title": "Audio Ingestion Memory and Cycle Budgeting", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What circular audio buffer size and M7 cycle budget are needed for zero-copy 250 ms stride processing on Corstone-300?", "chain_ids": 
["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1150", "title": "ESP32-S3 Tensor Arena Sizing with WiFi", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the required size of the flat tensor arena and determine if it can be allocated entirely in the internal SRAM?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 0}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1151", "title": "ESP32-S3 Vector Extension Speedup", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the theoretical minimum inference latency using standard scalar instructions (1 MAC per cycle) versus the specialized INT8 vector extensions (16 MACs per cycle)?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], "chain_positions": {"tinyml-chain-auto-secondary-012-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1152", "title": "Adversarial Detection Latency on Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical latency does a 336k-MAC INT8 adversarial detector add on Cortex-M4 with CMSIS-NN SIMD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1153", "title": "Estimating Inference Latency and Energy on nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected latency for a single inference and the active energy consumed if the system draws ~5mA at 3.0V?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1155", "title": "2-Bit Weight Unpacking and Execution on Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total Flash footprint for these weights and the theoretical minimum compute cycles for the MAC operations?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 0}, "chain_tiers": 
{"tinyml-chain-auto-secondary-009-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1156", "title": "Ethos-U55 Compiler Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many cycles does a 10% Cortex-M7 fallback add during Ethos-U55 inference, and what share of time is CPU-bound?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 0}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1157", "title": "ESP32-S3 Peak Memory Footprint Estimation", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the peak SRAM footprint required for this model's weights and the bottleneck layer's activations, and does it fit in internal SRAM alongside 120 KB of RTOS overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1158", "title": "Watchdog Timeout Calculation for Corstone-300", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total execution time in milliseconds to establish the minimum safe watchdog timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1159", "title": "Theoretical Latency Comparison of CPU vs NPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the theoretical minimum CPU and Ethos-U55 execution times for a 2.4M-MAC Corstone-300 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1160", "title": "Randomized Smoothing Latency on ESP32-S3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much latency does a five-pass randomized smoothing defense add on ESP32-S3, and does it meet a 150 ms deadline?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1161", "title": "MAC Reduction from Depthwise Separable Convolution", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would 
you calculate the exact theoretical reduction factor in multiplication-accumulation operations (MACs) achieved by this architectural change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1162", "title": "Estimating Inference Latency for INT8 Convolution on Cortex-M4", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum latency of the 32-to-64 channel INT8 convolution on Cortex-M4 running at 168 MHz?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1163", "title": "Active Learning Buffer Capacity on STM32F4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many complete 1.5-second 8-bit audio samples can the STM32F4 buffer in remaining flash for active learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1164", "title": "ESP32-S3 PSRAM to SRAM DMA Streaming Latency", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the minimum latency incurred by DMA-streaming 4 MB of ESP32-S3 weights from PSRAM to SRAM at 40 MB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1165", "title": "Calculating Footprint for 3-bit Quantized Weights", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the exact memory required in kilobytes (1 KB = 1024 bytes) to store these 3-bit quantized weights, assuming they are tightly packed without any padding.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1166", "title": "Calculate Fallback Model MAC Budget on Cortex-M7", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the system requires a strict latency budget of 15ms and the Cortex-M7 achieves 0.5 MACs per cycle, what is the maximum MAC limit for the fallback model?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1167", "title": "Distillation vs Pruning Latency on Ethos-U55", "topic": "knowledge-distillation", 
"competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the inference latency in milliseconds for unstructured pruning vs distillation to demonstrate why distillation beats pruning?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1168", "title": "Calculate End-to-End Wake Word Pipeline Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical end-to-end Cortex-M4 wake-word latency including MFCC, inference, and post-processing assuming 100% MAC utilization?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1169", "title": "SRAM Allocation and Peak Memory Calculation on nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much SRAM remains for the ML arena, and will this deployment fit?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 0}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1170", "title": "TFLite Micro Memory Allocation on nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the remaining available Flash and SRAM on the device after deploying and running the converted model?", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 0}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1171", "title": "SRAM Constraint Calculation for Ethos-U55 CNN Deployment", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of model parameters you can support if the entire model must reside in SRAM to meet latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1172", "title": "Layer Fusion SRAM Calculation", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "What are the sequential and fused peak SRAM requirements for the two-layer convolution subgraph?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1173", "title": "Calculate Inference Latency from Trace Cycles", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total latency for 1.5M MAC cycles plus 600K SRAM overhead cycles on a 128 MHz nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1174", "title": "Calculate Latency of Pruning Methods on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the exact inference latency in microseconds and the parameter flash footprint in kilobytes for both pruning strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1175", "title": "Privacy Guardrail Energy Budget Calculation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the daily energy budget (in millijoules) explicitly spent on enforcing this responsible AI requirement.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1176", "title": "Calculate Roofline Throughput on nRF5340", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What roofline throughput can the nRF5340 achieve for a 1D convolution with 0.125 MACs per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1177", "title": "Vibration Ingestion Buffer Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum sleep duration in milliseconds for the app core before it must wake up to process a filled buffer?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1178", "title": "Watchdog Timer Calculation for ML Inference", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the minimum WDT timeout interval (in milliseconds) required to ensure the WDT does not falsely trigger during a 
single inference pass, adding a 20% safety margin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1179", "title": "Calculate Energy Per Inference on Cortex-M4", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does one wake-word inference consume at 1.5 INT8 MACs per cycle and 40 mA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1180", "title": "Calculate Vision Pipeline Slack Time", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much frame slack remains for the 30 FPS Corstone-300 object detector after M7 preprocessing and NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1181", "title": "Heterogeneous Vision Architecture on ESP32-S3", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why should the security camera use an external SPI NPU, and how do communication costs affect the design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1182", "title": "Adversarial Defense for Wake-Word on ARM Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the STM32F4 wake-word system defend against adversarial audio within a strict 50 ms latency budget?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1183", "title": "Continuous Acoustic Wake-Word on nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you partition the workload between the 128 MHz app core and 64 MHz network core to meet the 100 µA limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1184", "title": "On-Device Active Learning for Audio", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should the nRF5340 architect data buffering, uncertainty compute, and radio scheduling to strictly respect the 256 KB SRAM, 1 MB flash, and ~5 mA active power constraints?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 3},
"chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1185", "title": "Continuous DMA Ingestion and Bus Contention", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the memory architecture and data movement strategy to guarantee zero-copy ingestion, zero data loss, and minimal memory bus contention between the DMA controller and the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1186", "title": "Solar-Powered Audio on ARM Cortex-M4", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a solar-powered Cortex-M4 audio detector stay under a continuous 1 mW budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1187", "title": "Energy-Optimal Architecture for Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a novel model architecture and execution schedule that trades off computational complexity for memory locality to minimize total energy per inference?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1188", "title": "Ternary Weight Transformer on Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a ternary-weight KWS transformer on Cortex-M4 store and compute 2-bit weights within Flash and SRAM limits?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1189", "title": "Decentralized Anomaly Detection via Federated Averaging on nRF5340", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Architect a cross-device federated learning system that handles non-IID data while strictly adhering to nRF5340 memory and power budgets.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1190", "title": "Asymmetric Dual-Model Degradation on ESP32-S3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L6+", 
"zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 degrade from a PSRAM-streamed CNN to an SRAM-resident anomaly model under low battery or PSRAM throttling?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1191", "title": "Novel Distillation Strategy for ESP32-S3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you construct the student architecture and distillation loss function to fit within 512 KB SRAM while maximizing the Xtensa cores?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1192", "title": "Dual-Core Anomaly Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a dual-core MCU split sensing, FFT, CNN inference, IPC, and BLE transmission to meet a 40 ms anomaly alert budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1193", "title": "Ethos-U55 Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 stream Flash-resident CNN weights and schedule SRAM to keep the Ethos-U55 utilized?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 3}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1194", "title": "Ethos-U55 Delegation and Shared SRAM Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a model conversion and runtime delegation strategy to maximize inference throughput while preventing SRAM exhaustion during CPU-NPU context switches?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 2}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1195", "title": "ESP32-S3 Memory Hierarchy Design for Always-On Audio", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 keep the always-on audio inference critical path in SRAM while prefetching PSRAM weights?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 
2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1196", "title": "Hardware-Aware NAS Design for ARM Cortex-M4 Keyword Spotting", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should a Cortex-M4 hardware-aware NAS encode SRAM, Flash, latency, and CMSIS-NN SIMD constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1197", "title": "Dual-Core Operator Scheduling for Heterogeneous Memory", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 schedule the CNN-LSTM cascade across cores and memory tiers to fit 512 KB SRAM and 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1198", "title": "Asymmetric Dual-Core ML Power Partitioning", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system partition DMA, DSP, ML inference, BLE, and sleep states to meet a one-year fall-detection battery budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1199", "title": "NPU-CPU Bus Contention and Trace Profiling in Corstone-300", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a profiling strategy to isolate the root cause and propose an architectural modification to the model or memory layout to achieve the target latency?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1200", "title": "NPU-Aware Sparsity on Corstone-300", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should a 750 KB acoustic model be pruned to fit Corstone-300's 512 KB SRAM and preserve Ethos-U55 utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1201", "title": "Mixed-Precision Audio Keyword Spotting on ESP32-S3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect the calibration pipeline, partition the network layers for different precisions, and ensure the vector extensions 
are maximally utilized despite the mixed bit-widths?", "chain_ids": ["tinyml-chain-auto-secondary-004-16"], "chain_positions": {"tinyml-chain-auto-secondary-004-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1202", "title": "Dual-Core Real-Time Inference Architecture", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should nRF5340 guarantee a 1 ms vibration inference deadline while streaming BLE alerts without causing connection drops?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 3}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1203", "title": "On-Device PII Redaction Guardrail", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 detect and redact spoken PII on-device while fitting DSP, CNN, and buffers in 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1204", "title": "Corstone-300 Roofline Optimization for Micro-Transformers", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 fuse micro-transformer attention to move from memory-bound to compute-bound execution within 512 KB?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1205", "title": "ESP32-S3 ASIL-B Fallback Architecture", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 partition safety and ML work to satisfy an ASIL-B 50 ms fault-tolerant interval?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1206", "title": "Real-Time Vibration Ingestion on Corstone-300", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system use DMA and zero-copy feature computation to avoid dropped 16 kHz vibration frames?", "chain_ids": ["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 2}, "chain_tiers": 
{"tinyml-chain-auto-secondary-013-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1207", "title": "NPU-Aware CNN Design for Ethos-U55", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an NPU visual wake-word CNN be redesigned to maximize MAC utilization without SRAM spilling or MCU fallback?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 2}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1208", "title": "Ethos-U55 Compiler Tiling for Operator Fallback", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a compiler synthesize tiling and pipelining between the U55 and M7 to avoid spilling a 300 KB activation to external memory?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1209", "title": "Asymmetric OTA Architecture for Dual-Core ML", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an edge MCU structure a fail-safe OTA update for firmware and model payloads within exactly 1 MB Flash?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1210", "title": "Cross-Hierarchy Tensor Arena for ESP32-S3", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a custom static memory planner and operator schedule that partitions the tensor arena across SRAM and PSRAM to ensure real-time execution?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 3}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1211", "title": "SIMD vs NPU for STM32F4 Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the microcontroller add an external NPU or use CMSIS-NN SIMD to meet the 20 ms CNN deadline, and why?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": 
{"tinyml-chain-auto-secondary-012-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1212", "title": "Optimizing Side-Channel Defenses on nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you optimize this side-channel defense to maintain security while reducing compute overhead to meet power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1213", "title": "Ethos-U55 Compute Utilization and Bottleneck Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you quantify the compute utilization bottleneck and the expected latency if the feature maps are optimized to fit entirely within SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-02"], "chain_positions": {"tinyml-chain-auto-secondary-004-02": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1214", "title": "Optimizing Window Annotations for NPU SRAM", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you adjust the annotation workflow to resolve this hardware bottleneck, and what is the quantified impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1215", "title": "Zero-Copy Audio DMA on nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much does zero-copy EasyDMA save in terms of SRAM and CPU latency when the nRF5340 currently copies a 16 KB audio buffer before each inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1216", "title": "ESP32-S3 Keyword Spotting Energy Bottleneck Optimization", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 reduce keyword-spotting energy when PSRAM weight reads dominate each inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1217", "title": "Audio Model Degradation Ladder on Cortex-M4", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the Cortex-M4 audio pipeline shed work when RTOS network activity leaves too few cycles for the primary 
model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1218", "title": "Distillation vs Pruning for INT8 SIMD", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the 80% sparse Cortex-M4 model miss the 20 ms deadline, and how does a dense distilled INT8 model fix it?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1219", "title": "Corstone-300 Vision Pipeline Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 60 ms non-NPU latency in the Corstone-300 vision pipeline, and how should preprocessing and weight fetching be optimized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1220", "title": "ESP32-S3 Operator Fallback Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this conversion bottleneck and quantify the performance gained by modifying the model format to use a supported operator?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 2}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1221", "title": "Optimizing Peak SRAM Footprint for CNN on STM32F4", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 400 KB-weight STM32F4 CNN OOM at runtime, and how should the bottleneck activation layer be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1222", "title": "Hardware-Aware NAS Constraints on Dual-Core MCU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the system-level interactions causing OOM faults and quantify the new constraints for your NAS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1223", "title": "Depthwise and Pointwise Convolution Scheduling for SRAM Optimization", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", 
"question": "How do you diagnose this bottleneck and quantify the memory savings of an optimized scheduling approach on a memory-constrained MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1224", "title": "Delta OTA Optimization for Ethos-U55 NPU Models", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this storage bottleneck and quantify an optimization strategy to perform reliable, in-place OTA updates without external memory?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1225", "title": "SRAM Bottlenecks and Structured Pruning on ESP32-S3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning of an 800 KB INT8 model fail to fit in 512 KB SRAM, and how does it affect ESP32-S3 performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1226", "title": "Symmetric Weights and SIMD Utilization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do asymmetric INT8 weights slow Cortex-M4 pointwise convolutions, and how should quantization be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1227", "title": "Optimizing OOD Safety Guardrails on ESP32-S3", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 replace a 10-component FP32 GMM OOD guardrail that exceeds the 50 ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1228", "title": "ESP32-S3 Roofline Analysis for Wake-Word", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using roofline analysis, diagnose the compute vs. 
memory bottleneck and quantify the exact speedup if you permanently pin these weights to the internal SRAM instead of fetching them from PSRAM.", "chain_ids": ["tinyml-chain-auto-secondary-013-24"], "chain_positions": {"tinyml-chain-auto-secondary-013-24": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1229", "title": "Optimizing Inference for Watchdog Deadlines", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can CMSIS-NN SIMD keep inference and memory self-test within the 10 ms watchdog window?", "chain_ids": ["tinyml-chain-auto-secondary-008-06"], "chain_positions": {"tinyml-chain-auto-secondary-008-06": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1230", "title": "DMA Ping-Pong Buffering for Continuous Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the ingestion bottleneck, quantify the data loss, and determine the architectural fix required?", "chain_ids": ["tinyml-chain-auto-secondary-013-28"], "chain_positions": {"tinyml-chain-auto-secondary-013-28": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1231", "title": "In-Place Tensor Arena Optimization for Residual Blocks", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this bottleneck and quantify an optimization to the memory planner to resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1232", "title": "Federated Learning Memory Bottleneck on Corstone-300", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should federated learning on Corstone-300 avoid SRAM OOM during local updates and global model reception?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1233", "title": "Optimizing CNN Inference on ESP32-S3 via INT8 Lowering", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does INT8 lowering fix the ESP32-S3 keyword CNN's PSRAM and latency bottlenecks, and what speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id":
"tinyml-1234", "title": "ESP32-S3 SRAM vs PSRAM Activation Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the bottleneck and quantify how reallocating memory resolves the latency issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1235", "title": "Corstone-300 NPU Power Bottleneck and DVFS Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose this power bottleneck and quantify a power optimization strategy to stay under the 50mW cap without missing the 50ms latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1236", "title": "Mitigating Shared SRAM Contention in U55", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Ethos-U55 latencies spike to 12 ms when M7 sensor interrupts log into the shared 512 KB SRAM, and how is it resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1237", "title": "MobileNet Partitioning for ESP32-S3 Memory Hierarchy", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 execution on ESP32-S3 partition weights and activations between PSRAM and SRAM to minimize latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1238", "title": "On-Device Hard Negative Mining for ESP32-S3", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should ESP32-S3 perform on-device hard-negative mining without exhausting SRAM or radio power?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 3}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1239", "title": "Zero-Copy DMA Pipeline for Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 architecture utilize DMA to handle 60 FPS visual inference without inducing memory bus contention?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "tinyml-1240", "title": "Arithmetic Intensity and SRAM Energy Tradeoffs", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can depthwise separable convolutions hurt STM32F4 battery life, and how should arithmetic intensity guide layer choices?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1241", "title": "Sub-4-bit Quantization Tradeoffs for Keyword Spotting on Corstone-300", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you evaluate 2-bit weight quantization for a keyword spotter that must fit a 1.2M-parameter CNN into 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1242", "title": "Federated Averaging Memory and Communication Sizing on ESP32-S3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 firmware manage memory buffers and communication schedules to process a 150K-parameter CNN update efficiently?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 1}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1243", "title": "Graceful Degradation for Anomaly Detection on nRF5340", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design this fallback architecture and concretely size the memory and compute allocations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1244", "title": "Sizing a Distilled Keyword Spotting Model for nRF5340", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the strict platform constraints and size the student model's parameters and peak activations concretely?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1245", 
"title": "ESP32-S3 Speech-to-Intent Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the end-to-end latency by decomposing it into preprocessing, TTFT, TPOT, and network transmission.", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1246", "title": "Evaluating Flash vs SRAM Execution for Cortex-M4 Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a Cortex-M4 acoustic anomaly model read 700 KB of weights directly from Flash or page them into SRAM with DMA?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 4}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1247", "title": "TFLite Micro Operator Fallback on Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you handle a TFLite Micro layer that falls back from CMSIS-NN to a slow reference kernel on a constrained microcontroller?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 4}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1248", "title": "Sizing an Audio Wakeword Model for Nordic nRF5340", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you size SRAM and flash for an always-on nRF5340 wakeword detector while reserving resources for Zephyr, BLE, and audio buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1249", "title": "SRAM-Constrained NAS on Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you formulate the NAS memory constraint function to ensure the discovered models can execute without SRAM overflow?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1250", "title": "Dual-Core Operator Scheduling on Nordic nRF5340", "topic": "operator-scheduling", 
"competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the operator execution schedule to leverage the dual cores, apply layer fusion, and optimize memory reuse to fit within the strict SRAM limits?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1251", "title": "ESP32-S3 Flash Partitioning for Model OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the partition layout constraints and propose a deployment strategy that ensures safe OTA updates without exceeding the 8 MB flash?", "chain_ids": ["tinyml-chain-auto-secondary-004-13"], "chain_positions": {"tinyml-chain-auto-secondary-004-13": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1252", "title": "Evaluating Race-to-Sleep vs DVFS on ESP32-S3", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an ESP32-S3 acoustic detector use 240 MHz race-to-sleep or 80 MHz underclocking to minimize energy per inference?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1253", "title": "Optimizing SIMD Utilization and Memory Stalls on Cortex-M4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause of this poor utilization, and what architectural or memory-level changes do you make to hit your 20ms target?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1254", "title": "Structured vs Unstructured Pruning on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate this proposal against an alternative 50% structured channel pruning approach, considering the specific architectural constraints of the target hardware?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 3}, "chain_tiers": 
{"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1255", "title": "Evaluating Quantization Granularity on nRF5340", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an nRF5340 wake-word model use per-tensor or per-channel INT8 quantization for depthwise layers under a 50 ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1256", "title": "Sizing an On-Device Anti-Spoofing Guardrail", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the maximum allowable capacity (in memory and MACs) for the new INT8 guardrail model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1257", "title": "Roofline Analysis of Depthwise Convolutions", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 3x3 depthwise convolution on a Cortex-M4 compute-bound or memory-bound under a roofline analysis, and what is its maximum throughput?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1258", "title": "Architecting ML Safety on Dual-Core nRF5340", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you architect watchdogs, MPU protection, and task scheduling so nRF5340 ML inference cannot compromise safety fault handling?", "chain_ids": ["tinyml-chain-auto-secondary-008-08"], "chain_positions": {"tinyml-chain-auto-secondary-008-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1259", "title": "Audio Streaming Ping-Pong Sizing on Cortex-M4", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you size the DMA ping-pong audio buffer and remaining compute budget for a Cortex-M4 Mel spectrogram pipeline?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1260", "title": "Tensor Arena Optimization for BLE-Enabled nRF52840 Edge Devices", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the tradeoffs of different memory planning strategies to resolve this constraint without degrading the ~5mA active power budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1261", "title": "nRF5340 Always-On Wake Word Energy Budgeting", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the required duty cycle and sleep state power constraints to make this system feasible?", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 3}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1262", "title": "Optimizing Convolution Lowering for Cortex-M4 SIMD", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a graph compiler lower depthwise convolutions to exploit Cortex-M4 CMSIS-NN SIMD without memory stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1263", "title": "Keyword Spotting Deadlines on ESP32-S3", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you partition the memory and schedule the inference to guarantee the 20ms deadline without dropping audio frames or breaking the BLE connection?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 2}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1264", "title": "Identifying Side-Channel Attacks on Edge NPUs", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What physical side-channel attack can reveal proprietary Ethos-U55 model weights through power or electromagnetic traces during inference?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1265", "title": "ESP32-S3 Memory Hierarchy for Model Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": 
"remember", "status": "published", "phase": "inference", "question": "What are the specific sizes of the internal SRAM and external PSRAM on the ESP32-S3, and why does this matter for compute performance?", "chain_ids": ["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1266", "title": "Data Type Formatting for ESP32-S3 Hardware Acceleration", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What specific numeric data type should the dataset features be mapped to for optimal hardware-accelerated inference on this platform?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 0}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1267", "title": "SRAM vs Compute Energy Cost", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What fundamental fact from the Horowitz energy table should you recall regarding the relative energy cost of SRAM access versus integer compute?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1268", "title": "Defining Fail-Safe vs Fail-Operational on nRF5340", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the difference between fail-safe and fail-operational behavior for an nRF5340 predictive maintenance device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1269", "title": "Knowledge Distillation Soft Targets on nRF5340", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific term for the temperature-scaled probability distribution produced by the teacher model that the student model attempts to match?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1270", "title": "ARM Cortex-M4 SRAM and Flash Capacities", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What 
are the exact capacities of the on-chip SRAM and Flash memory for this specific platform?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 0}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1271", "title": "Nordic nRF5340 Memory Limits Recall", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware specifications of the Nordic nRF5340, what are its exact SRAM and flash memory capacities, and will this model fit into SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1272", "title": "Identifying Peak NPU Throughput for Corstone-300 Architecture Search", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "To correctly set the upper bound for the search space's latency estimator, what is the configurable range of MAC operations per cycle supported by the Ethos-U55 NPU?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1273", "title": "Nordic nRF5340 Dual-Core Architecture for Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the specific clock frequencies of the application core and the network core on the Nordic nRF5340?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1274", "title": "CMOS Dynamic Power Equation Recall", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What CMOS dynamic power equation should you recall when optimizing ESP32-S3 DVFS P-states for an always-on keyword spotter?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1275", "title": "Cycle Count Profiling on ARM Cortex-M4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of the standard ARM CoreSight register used to count clock 
cycles for precise profiling?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1276", "title": "Standard Transparency Artifacts for Edge Models", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What standard transparency artifact should document intended use, bias evaluation, and hardware limits for an edge audio model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1277", "title": "Peak Compute Derivation for ARM Cortex-M4 Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the peak INT8 multiply-accumulate (MAC) throughput per clock cycle on this specific architecture?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1278", "title": "Continuous Audio Ingestion on ARM Cortex-M4", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific microcontroller hardware peripheral and associated memory layout strategy must you recall and implement to stream this data without overwhelming the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1279", "title": "Nordic nRF5340 Dual-Core Clock Frequency Recall", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the application-core and network-core clock frequencies of the Nordic nRF5340 for mapping BLE and ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1280", "title": "Ethos-U55 Supported Quantization Precisions", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which quantization precisions are natively accelerated by the Ethos-U55, and what happens to unsupported sub-4-bit operations?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1281", "title": "Identifying Latency Components on ESP32-S3", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When breaking down the total latency, what term describes the time spent computationally converting raw sensor data into the required input tensor?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1282", "title": "CMSIS-NN Quantization Format Constraints", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific data type and quantization scheme must the model's weights and activations be converted to during the TFLite conversion process to leverage the CMSIS-NN SIMD instructions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1283", "title": "ESP32-S3 Structured Pruning Impact", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which specific type of pruning (structured or unstructured) must you use to ensure the INT8 vector instructions actually execute fewer cycles?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1284", "title": "Asymmetric Quantization Equation for nRF5340", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When converting weights from FP32 to INT8, what is the standard mathematical equation for asymmetric quantization that maps a real-world floating-point value to a quantized integer value?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1285", "title": "Ethos-U55 Shared SRAM Architecture", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory architecture feature of the Corstone-300 allows the NPU to access the 512 KB SRAM without requiring explicit host-to-device DMA transfers?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1286", 
"title": "Vibration Anomaly Detection Accelerator Selection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 meet a 20 Hz, 15M-MAC vibration anomaly workload, or does the design need an external NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1287", "title": "Secure Wake-Word Pipeline Design for nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you specify a secure nRF5340 wake-word pipeline that defends against replay attacks within SRAM, flash, and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1288", "title": "Estimating Inference Latency on Corstone-300", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate Corstone-300 inference latency for a 5M-MAC CNN while accounting for shared-SRAM bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1289", "title": "Dataset Specification for Constrained INT8 Acoustic Models", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you specify the curation workflow to ensure the resulting dataset yields a robust INT8-quantized model optimized for the Ethos-U55?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 2}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1290", "title": "nRF5340 DMA Pipeline Design for Audio ML", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 audio ML pipeline use EasyDMA and SRAM buffers to ingest 16 kHz audio without unnecessary CPU wakeups?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1291", "title": "ESP32-S3 Memory Hierarchy Energy Analysis", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP32-S3 model placement strategy is more energy efficient: a 400 KB SRAM-resident model or a 2.5 MB PSRAM model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1292", "title": "Sub-4-bit Quantization Specification for nRF5340 Audio Model", "topic": 
"extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you specify sub-4-bit weights and INT8 activations so a 1.2M-parameter nRF5340 audio model fits flash and SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1293", "title": "Design Graceful Degradation for Cortex-M4 Keyword Spotter", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M4 keyword spotter degrade gracefully when BLE interrupts reduce the available CPU window to 15 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1294", "title": "Ethos-U55 Operator Offloading and Fallback Analysis", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the system-level implications of this compilation outcome and specify a graph optimization strategy to satisfy strict latency constraints?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1295", "title": "Designing a Distilled Keyword Spotter for Cortex-M4", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you design an INT8 distilled keyword-spotting student model for a Cortex-M4 with no FPU and tight SRAM limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1296", "title": "Keyword Spotting Latency Budget on Corstone-300", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose a 50 ms Corstone-300 keyword-spotting latency budget across audio capture, M7 preprocessing, and U55 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1297", "title": "Optimizing Keyword Spotting Memory Allocation on ESP32-S3", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where should ESP32-S3 KWS and speaker-verification weights, activations, and audio buffers reside to minimize latency and power?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1298", "title": "KWS Memory Architecture on Cortex-M4", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you partition Cortex-M4 SRAM and flash to determine the maximum feasible size of an INT8 depthwise-separable KWS model?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1299", "title": "Hardware-Aware NAS for Keyword Spotting on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should a hardware-aware NAS pipeline constrain memory, latency, and search rewards for nRF5340 keyword spotting?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1300", "title": "SRAM-Constrained Layer Fusion on Cortex-M4", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Specify a memory management architecture to ensure peak memory stays under 256 KB without modifying model weights.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1301", "title": "Continuous KWS Power Specification on Corstone-300", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Corstone-300 continuous KWS system choose DVFS, duty cycling, and SRAM placement to stay below a 1.5 mW cap while meeting a 100 ms inference latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1302", "title": "ESP32-S3 Wake-Word Profiling Specification", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you profile wake-word latency spikes to distinguish vectorization failures, PSRAM cache thrashing, and I/O preemption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1303", "title": "Designing a Pruning Strategy for ESP32-S3 SRAM Constraints", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": 
"training", "question": "How would you design a pruning specification that achieves this memory target while ensuring the model fully utilizes the ESP32-S3's INT8 acceleration?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1304", "title": "Designing Quantization for Keyword Spotting on Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What INT8 quantization specification lets a 500 KB FP32 keyword spotter avoid FPU emulation and fit memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1305", "title": "Acoustic Pipeline Scheduling on Dual-Core MCU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the 128 MHz app core and 64 MHz net core, how do you partition the pipeline to guarantee this deadline while respecting a ~5mA current limit?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 1}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1306", "title": "On-Device Privacy Guardrails for ESP32-S3 Voice Assistant", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a specification for an on-device, fail-safe guardrail system that prevents unauthorized audio transmission while maintaining acceptable latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1307", "title": "ESP32-S3 Roofline Memory Architecture Specification", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system stage the 800 KB of weights so the arithmetic intensity remains compute-bound and satisfies the 50 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1308", "title": "ESP32-S3 Continuous Audio Ingestion and Feature Extraction Design", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 audio pipeline buffer I2S data and compute MFCCs while preserving 300 KB SRAM for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1309", "title": 
"Designing a Static Tensor Arena for Wake-Word CNN", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the static memory allocation be designed to fit the tensor arena in SRAM and handle model parameters?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 1}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1310", "title": "Acoustic Pest Detection Duty Cycling on Corstone-300", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a duty-cycling strategy for the M7, Ethos-U55, and shared SRAM to achieve a 2-year battery life on a 220 mAh coin cell?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1311", "title": "Diagnosing CNN Performance on ESP32-S3 with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis diagnose whether an ESP32-S3 keyword-spotting CNN with PSRAM-resident weights is memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1312", "title": "Roofline Analysis on Nordic nRF5340: Optimizing TinyML Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you use roofline analysis on an nRF5340 to classify a 100K-MAC, 60 KB-access KWS model as compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1313", "title": "Roofline Analysis for TinyML on ESP32-S3", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis account for ESP32-S3 SRAM, PSRAM, and accelerator utilization when diagnosing slow TinyML inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1314", "title": "Roofline Analysis for TinyML: Cortex-M7 + Ethos-U55 Performance Evaluation", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis predict whether a Cortex-M7 plus Ethos-U55 CNN workload is compute-bound or memory-bound, and predict NPU utilization?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1315", "title": "Roofline Analysis for TinyML on ARM Cortex-M4 STM32F4", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a roofline model tailored to an FPU-less STM32F4 diagnose a slow 1D CNN and guide optimization?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1316", "title": "Diagnosing TinyML Performance on Nordic nRF5340: Accelerator Trade-offs", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose nRF5340 keyword-spotting latency, brownouts, and battery drain when choosing CPU optimization or acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1317", "title": "TinyML Anomaly Detection: Cortex-M4 CPU vs. Custom ASIC for 8-bit CNN", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should a Cortex-M4 industrial anomaly detector run an INT8 CNN on CPU or offload critical layers to a custom ASIC to meet a 5ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": {"tinyml-chain-auto-secondary-012-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1318", "title": "ESP32-S3 TinyML Accelerator Feasibility", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does a 200K-MAC KWS model on ESP32-S3 need a custom accelerator, or can optimized CPU inference meet 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1319", "title": "TinyML Accelerator Trade-offs for Edge Anomaly Detection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which platform best meets the 10 Hz anomaly detection requirement while optimizing for long battery life?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1320", "title": "Accelerator Selection for Edge ML on ESP32-S3", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare the ESP32-S3 CPU, a minimal NPU, and a custom ASIC for 
small image classification under power limits?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], "chain_positions": {"tinyml-chain-auto-secondary-012-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1321", "title": "Optimizing a TinyML Model for Nordic nRF5340 Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you optimize an nRF5340 anomaly model to meet 100 Hz, sub-10 ms inference without an FPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1322", "title": "TinyML Compute Estimation for Nordic nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which compute and memory metrics must be estimated before deploying, and what is the theoretical latency floor for a 500k MAC model?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1323", "title": "Diagnosing High Inference Cost on nRF5340 for Keyword Spotting", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose why nRF5340 keyword-spotting inference takes 250 ms and battery life is far below target?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1324", "title": "TinyML Resource Estimation for Keyword Spotting on nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate MACs, peak RAM, latency, and quantization choices for nRF5340 keyword spotting?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1325", "title": "Edge AI Power Budget: Optimizing Gesture Recognition on ESP32-S3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate cloud training cost and year-long ESP32-S3 inference energy for a wearable gesture model?", "chain_ids": 
["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1326", "title": "ESP32-S3 Memory Hierarchy: Capacity, Bandwidth, Latency Tradeoffs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the memory tiers and how do their capacity, bandwidth, and latency affect TinyML placement?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 0}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1327", "title": "TinyML Memory Latency Analysis on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many CPU cycles remain for non-memory operations in the STM32F4 CNN critical section after SRAM access costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1328", "title": "ESP32-S3 Memory Bottleneck Diagnosis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely cause of the inference latency spikes when CPU utilization drops and PSRAM access requests increase?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 2}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1329", "title": "TinyML Model Deployment: nRF5340 Memory Hierarchy Optimization", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you manage the model and data placement across the nRF5340's memory hierarchy to meet the real-time constraints, considering capacity, bandwidth, and latency tradeoffs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1330", "title": "TinyML Memory Hierarchy Optimization on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which approach would you recommend for maximizing inference throughput while maintaining acceptable power consumption for a real-time application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1331", "title": "CNN Feature Map Optimization: SRAM vs. 
External DRAM on Cortex-M7/Ethos-U55", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose this memory bottleneck and quantify the potential performance improvement by moving it to SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1332", "title": "TinyML Memory Hierarchy Optimization on Nordic nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 place 100 KB CNN weights, 50 KB activations, buffers, and code across flash, SRAM, and registers?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 2}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1333", "title": "Optimizing CNN Memory Footprint on ARM Cortex-M4 for TinyML", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze tensor arena layout and operator scheduling so a Cortex-M4 CNN fits within 256 KB SRAM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1334", "title": "Tensor Arena Planning for TinyML on Nordic nRF5340", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you plan a flat tensor arena and flash versus SRAM placement for a CNN on the Nordic nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1335", "title": "ESP32-S3 TinyML Inference Latency Due to Data Movement", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the most likely root cause(s) for the increased latency and propose a solution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1336", "title": "Optimizing Data Movement for Real-time TinyML on Cortex-M7 + Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you leverage DMA, zero-copy techniques, and memory pinning to avoid CPU-bound data movement overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1337", "title": "Optimizing Sensor Data Ingress on nRF5340 for TinyML Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 move 16 KB sensor inputs from a high-speed peripheral to a TinyML model with minimal CPU overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1338", "title": "Optimizing Sensor Data Ingest for TinyML on ESP32-S3", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 DMA and buffering replace CPU copies for 200 KB sensor transfers in a 50 ms anomaly pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1339", "title": "Optimizing DMA and Data Movement for Real-time ML Inference on Cortex-M7 + Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should DMA and ping-pong buffering raise a Cortex-M7 plus Ethos-U55 image pipeline from 10 FPS to 20 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1340", "title": "Optimizing Data Movement for TinyML Inference on Nordic nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 pipeline I2S audio windows into an ML model and BLE output with minimal CPU copying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1341", "title": "Latency Decomposition for Keyword Spotting on ARM Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How you would systematically decompose, measure, and analyze each component on this resource-constrained embedded platform to meet the latency target?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1342", "title": "ESP32-S3 Edge Latency Decomposition for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose the 700 ms latency and identify the components blocking the 300 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1343", "title": "Optimizing Real-time KWS Latency on Cortex-M7 + Ethos-U55", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How should you decompose and optimize end-to-end KWS latency on a Cortex-M7 plus Ethos-U55 system under a 100 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1344", "title": "Latency Decomposition on Nordic nRF5340 for TinyML Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you measure and optimize TTFT, TPOT, BLE, preprocessing, and postprocessing latency on nRF5340 anomaly detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1345", "title": "TinyML Keyword Spotting Latency Decomposition on ARM Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose Cortex-M4 KWS latency across audio acquisition, preprocessing, inference, and postprocessing under 150 ms?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1346", "title": "Latency Decomposition for TinyML Keyword Spotting on Cortex-M7 + Ethos-U55", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you measure each component, identify bottlenecks, and ultimately recommend one model over the other based on this detailed latency analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1347", "title": "Real-time Anomaly Detection Latency on Nordic nRF5340", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can an nRF5340 wearable exceed a 250 ms anomaly-alert budget even when TinyML inference itself is fast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1348", "title": "Latency Decomposition and Optimization for Real-time TinyML on ESP32-S3", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you profile and optimize ESP32-S3 KWS latency from audio capture through MQTT alert under a 100 ms target?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1349", "title": "Real-Time TinyML Inference on Nordic 
nRF5340", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the architectural design guarantee a 33 ms real-time frame budget and prevent ANR timeouts on the nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1350", "title": "Real-Time TinyML Inference on ESP32-S3: Diagnosing Latency and Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system architecture be restructured to systematically diagnose and mitigate latency jitter to guarantee the 33ms real-time deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 5}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1351", "title": "Real-Time Gesture Recognition on Nordic nRF5340: Meeting 30ms Latency Budgets", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If the initial WCET analysis reveals that the inference frequently exceeds 30ms, what architectural, algorithmic, or software-level changes would you propose to bring it within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1352", "title": "Optimizing ML Inference Latency on Cortex-M7/Ethos-U55 for Real-time Edge Applications", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically profile and identify the primary bottlenecks contributing to this 30 ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1353", "title": "Optimizing TinyML Inference Latency on STM32F4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific profiling techniques and tools would you employ, considering the lack of an FPU and limited SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1354", "title": "Optimizing Real-time ML Inference on ESP32-S3: Diagnosing Latency Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": 
"analyze", "status": "published", "phase": "inference", "question": "How should you diagnose whether ESP32-S3 TinyML latency spikes come from compute, memory, or I/O bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1355", "title": "TinyML Latency Bottleneck on Cortex-M7+Ethos-U55", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify the specific latency bottlenecks (compute, memory, or I/O) and determine NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1356", "title": "Optimizing TinyML Latency on FPU-less Microcontrollers", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which bottlenecks keep an STM32F4 anomaly model at 150 ms, and which candidate architecture can meet the 80 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1357", "title": "Optimizing TinyML Latency on ESP32-S3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP-IDF profiling metrics and tools should identify why ESP32-S3 KWS latency exceeds 150 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1358", "title": "Optimizing Latency on Cortex-M7 for TinyML Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you find and quantify the bottleneck preventing the Ethos-U55 audio classifier from meeting its 5 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1359", "title": "Optimizing Latency for a TinyML Model on nRF5340", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Describe a detailed profiling methodology to identify whether the bottleneck is compute, memory access, or I/O, utilizing standard embedded development tools.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1360", "title": "INT8 Quantization Accuracy Drop on ARM Cortex-M4 STM32F4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might INT8 post-training quantization cause an unacceptable accuracy drop for a Cortex-M4 image classifier?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1361", "title": "Quantization Drift on Nordic nRF5340 for Keyword Spotting", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this accuracy drop and propose a solution, considering the nRF5340's constraints?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1362", "title": "Optimizing a Quantized TinyML Model for Resource-Constrained ARM Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you quantize a Cortex-M4 voice activity CNN to preserve accuracy while avoiding FPU emulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1363", "title": "Quantization Strategy for Edge Deployment on Cortex-M7 + Ethos-U55", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you improve INT8 quantization for a Cortex-M7 plus Ethos-U55 model with a 5% PTQ accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1364", "title": "Extreme Sub-4-bit Quantization for Keyword Spotting on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What strategies would you employ to recover the lost accuracy while maintaining the 2-bit memory footprint and respecting the ESP32-S3's hardware limitations?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1365", "title": "Extreme Quantization for TinyML Transformer on Cortex-M7/Ethos-U55", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach quantizing this model to sub-4-bit precision to fit within the 512KB SRAM limit and meet a 10ms inference latency requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1366", "title": "Extreme Quantization for Keyword Spotting", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the total flash memory footprint of the quantized model, and assuming linear scaling, what is the new SRAM requirement 
for activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1367", "title": "Diagnosing Sub-4-bit Quantization Accuracy Degradation on STM32F4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of sub-4-bit quantization accuracy degradation and memory faults on a standard microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1368", "title": "Sub-4-bit Quantization on Cortex-M7 + Ethos-U55 for TinyML", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you deploy a sub-4-bit keyword-spotting CNN on Cortex-M7 plus Ethos-U55 when an INT8 model exceeds SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1369", "title": "Extreme Quantization on nRF5340: Architecture Evaluation", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically evaluate and recommend the best model for the nRF5340, considering performance, resource constraints, and accuracy recovery feasibility?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1370", "title": "Extreme Quantization on Resource-Constrained ARM Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size and evaluate ternary or 2-bit quantization for a Cortex-M4 gesture CNN targeting sub-10 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1371", "title": "Optimizing a TinyML LLM with Sub-4-bit Quantization on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose bottlenecks, propose a sub-4-bit strategy, quantify gains, and recover accuracy for a TinyML LLM on ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1372", "title": "Nordic nRF5340 Power Mode Analysis", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": 
"L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should DVFS P-states and power caps be managed on such a device to optimize energy efficiency, considering its constrained resources and variable workload?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1373", "title": "Energy-Efficient TinyML Inference on ARM Cortex-M4 STM32F4", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze STM32F4 energy per inference using CMOS power, DVFS concepts, fixed-point CNNs, and memory access costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1374", "title": "Optimizing TinyML Power Budget on ESP32-S3 for Battery-Powered Edge Devices", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach the power budgeting of this system, considering the processor's DVFS P-states, memory access energy (SRAM vs. PSRAM), and TDP limitations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1375", "title": "TinyML Device: Unexpected Idle Power Drain", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the most likely root causes for this accelerated battery drain, and how would you systematically diagnose and rectify the problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1376", "title": "Optimizing ML Inference Power on Nordic nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you use DVFS and energy-per-inference measurements to keep nRF5340 anomaly detection within a 100 uW power budget?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1377", "title": "Optimizing an Ultra-Low-Power Edge ML Model on Nordic nRF5340 for Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you design a two-year coin-cell nRF5340 gesture model using Horowitz energy principles and energy-aware operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-1378", "title": "Diagnosing Unexpected Energy Drain in TinyML on STM32F4", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can memory access dominate energy drain for an 8-bit CNN on an STM32F4 despite a low MAC count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1379", "title": "Optimizing CNN Energy on ARM Cortex-M4 STM32F4", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cut CNN energy on an STM32F4 when SRAM access costs 10x a MAC and external flash costs 100x?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1380", "title": "Energy-Aware Inference Optimization on ESP32-S3 for TinyML", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you quantify potential energy savings using the Horowitz energy principles for memory access vs. arithmetic operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1382", "title": "Optimizing ESP32-S3 for Low-Power Acoustic Anomaly Detection with Solar and Coin Cell Power", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can an ESP32-S3 acoustic monitor meet a 1-year lifetime with 10-minute inference and CR2032 plus 2 V solar backup, and how should the power budget and duty-cycling strategy be designed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1383", "title": "Optimizing CNN for Real-time Anomaly Detection on Nordic nRF5340", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you redesign a CNN with depthwise separable convolutions and inverted residuals to meet latency and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1384", "title": "ESP32-S3 TinyML Deployment: Memory Feasibility Analysis for a Keyword Spotting CNN", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you assess whether a 300K-parameter INT8 KWS CNN with 200 KB activations fits on ESP32-S3 memory?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1385", "title": "TinyML Model Deployment on ARM Cortex-M4: Memory Constraint Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a Cortex-M4 KWS model with 180 KB weights and 90 KB activations fit and run effectively within 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1386", "title": "TinyML Model Feasibility on Nordic nRF5340 for Gesture Recognition", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the nRF5340's specifications, which model is more feasible to deploy given real-time memory and compute constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1387", "title": "TinyML Model Deployment on ARM Cortex-M4: Memory Constraints", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate weights, activations, buffers, and bottlenecks for a 100K-parameter INT8 CNN on an STM32F4?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1388", "title": "ESP32-S3 Model Deployment: Memory Footprint and Feasibility Analysis for a TinyML Vision Task", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you assess the deployment feasibility of a 500K-parameter CNN and diagnose potential memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1389", "title": "TinyML Model Feasibility", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Describe your systematic approach to estimate the model's parameter count, its total memory footprint after 8-bit integer quantization, and its approximate inference latency on a Cortex-M7 with Ethos-U55.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1390", "title": "Hardware-Aware NAS on ESP32-S3: Performance & Memory Trade-offs", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why would the hardware-aware NAS converge to a 400 KB SRAM, 50 MFLOP model with 150 
ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-011-21"], "chain_positions": {"tinyml-chain-auto-secondary-011-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1391", "title": "Hardware-Aware NAS for TinyML on Cortex-M7 + Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should a hardware-aware NAS pipeline find an Ethos-U55 anomaly model under 80 ms latency and 350 KB memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1392", "title": "Hardware-aware NAS Deployment on Resource-Constrained Microcontrollers", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root causes of the NAS model deployment failure considering STM32F4 hardware limitations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1393", "title": "Hardware-Aware NAS on ESP32-S3 for TinyML Vision", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should an ESP32-S3 hardware-aware NAS constrain its search space and construct its reward function to ensure the model satisfies the SRAM and latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1394", "title": "Hardware-Aware NAS on Ethos-U55: Memory and Latency Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What concerns remain for an Ethos-U55 NAS candidate with 250K INT8 parameters and 500M MACs despite fitting the parameter memory?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1395", "title": "Hardware-Aware NAS on Nordic nRF5340: Memory and Latency Evaluation", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which nRF5340 NAS candidate is better at 10 Hz: 500K FLOP FP32 with 160 KB SRAM or 1.2M FLOP INT8 with 70 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "tinyml-1396", "title": "Hardware-Aware NAS for Vision on ARM Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should hardware-aware NAS for an FPU-less Cortex-M4 constrain search space, strategy, and feedback for vision models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1397", "title": "Hardware-aware NAS for Edge Deployment on Cortex-M7/Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should Cortex-M7 and Ethos-U55 NAS incorporate SRAM, FLOPs, latency, and memory estimates before deployment?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 4}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1398", "title": "Pruning Strategies for nRF5340 Model Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the difference between structured and unstructured pruning, and which is better for nRF5340 edge deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1399", "title": "Pruning Strategies for Resource-Constrained Microcontrollers (ARM Cortex-M4)", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does structured pruning beat unstructured sparsity for a 1.5M-param CNN on a no-FPU STM32F4 with 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1400", "title": "Optimizing Keyword Spotting CNN for ESP32-S3 with Pruning & Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a comprehensive strategy to optimize this model using pruning and sparsity techniques to fit within memory constraints and meet performance targets?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1401", "title": "Ethos-U55 Pruning Impact: Latency & Memory for Object Detection", "topic": "pruning-sparsity", 
"competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you calculate the minimum achievable inference time, effective MACs per cycle, and freed SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1402", "title": "On-Device Model Pruning for STM32F4 with Sparsity and Performance Tradeoffs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the sparsity pattern be approached to maximize inference speedup on a Cortex-M4 without an FPU, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1403", "title": "Optimizing a CNN for Edge Deployment via Structured Pruning on Ethos-U55", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you prune an INT8 CNN for Ethos-U55 so it fits 512 KB SRAM while preserving dense NPU acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1404", "title": "Optimizing TinyML Inference on ARM Cortex-M4 via Pruning and Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should pruning and sparsity optimize a Cortex-M4 KWS model from 400 KB and 300 ms to 200 KB and 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1405", "title": "Diagnosing Knowledge Distillation Issues on ESP32-S3 for Anomaly Detection", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the engineering team systematically diagnose the distillation accuracy drop and CPU bottleneck on the ESP32-S3 architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1406", "title": "TinyML Knowledge Distillation for nRF5340 Anomaly Detection", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you distill a 5 MB teacher anomaly model into a student under 500 KB flash, 150 KB SRAM, and 50 ms?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1407", "title": "Knowledge Distillation vs. 
Pruning on ARM Cortex-M4 for Real-time Anomaly Detection", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you compare knowledge distillation and pruning for a real-time Cortex-M4 anomaly detector with no FPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1408", "title": "Optimizing Knowledge Distillation for Edge Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you train an MCU student model with knowledge distillation, quantization, and hardware-aware profiling, and why choose it over pruning?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1409", "title": "Optimizing TinyML Image Classification with Knowledge Distillation on Cortex-M7 + Ethos-U55", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks in the pruned model and propose a knowledge distillation strategy to recover accuracy on the Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1410", "title": "Optimizing a TinyML Model for Cortex-M7 with Ethos-U55: Graph Compilation Strategy", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a graph compiler convert and partition a TensorFlow CNN for Cortex-M7 plus Ethos-U55 real-time KWS?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1411", "title": "Optimizing a TinyML Model for ESP32-S3 with AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should AOT compilation, operator lowering, and constant folding help an ESP32-S3 KWS CNN fit SRAM and meet 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1412", "title": "nRF5340 KWS Inference Optimization: Graph Compilation Challenge", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the 500 ms baseline inference bottleneck and propose 
an AOT graph compilation strategy to achieve a 50 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1413", "title": "TinyML Graph Optimization for FPU-less ARM Cortex-M4", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should AOT graph compilation optimize an INT8 Cortex-M4 audio CNN from 50 ms toward sub-10 ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1414", "title": "Ethos-U55 Scheduling for Memory and Parallelism", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Ethos-U55 operator scheduling use memory reuse, parallelism, and layer fusion to reduce SRAM footprint and latency?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1415", "title": "Optimizing Operator Scheduling for TinyML on Nordic nRF5340", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are memory reuse, selective layer fusion, and realistic parallelism crucial when scheduling CNN operators on edge microcontrollers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1416", "title": "TinyML Operator Scheduling on ARM Cortex-M4 for Ultra-Low Latency Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you schedule, fuse, and reuse buffers for an INT8 Cortex-M4 CNN to minimize latency and SRAM footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1417", "title": "Diagnosing Inefficient Operator Scheduling on Cortex-M7/Ethos-U55 for TinyML", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does default scheduling leave the Ethos-U55 at 20% utilization with 40% of time spent on SRAM transfers?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1418", "title": "TinyML Operator Fusion on Cortex-M4 for Memory Constrained CNN", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should convolution, ReLU, and pooling be fused and scheduled to avoid intermediate activation OOM on a Cortex-M4 VAD CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1419", "title": "Optimizing Operator Scheduling for Memory and Performance on ESP32-S3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP32-S3 operator schedule best balances 512 KB SRAM reuse, dual-core parallelism, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1420", "title": "Ethos-U55 Operator Scheduling for Memory and Performance", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Cortex-M7 and Ethos-U55 operator scheduling minimize peak SRAM, maximize NPU utilization, and fuse CNN layers?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1421", "title": "TinyML Model Deployment on nRF5340: Optimizing Operator Schedule for Memory and Performance", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an operator scheduling strategy that allows the model to fit within the nRF5340's 256 KB SRAM constraints and meet the 100 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1422", "title": "CNN Operator Scheduling on ARM Cortex-M4 with Limited SRAM", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an FPU-less Cortex-M4 schedule CNN operators with memory reuse, fixed-point arithmetic, and fusion under 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1423", "title": "Designing Robust OTA Updates for TinyML on Resource-Constrained STM32F4", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an STM32F4 OTA design use A/B partitions and rollback for an 80 KB TinyML model?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1424", "title": "ESP32-S3 A/B OTA Flash Partition Sizing for ML Inference", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size ESP32-S3 flash partitions for robust A/B OTA updates when the ML firmware image is 3 MB?", "chain_ids": ["tinyml-chain-auto-secondary-004-13"], "chain_positions": {"tinyml-chain-auto-secondary-004-13": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1427", "title": "Model Conversion and Deployment on ESP32-S3 with TFLite Micro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the critical steps and memory placement strategies for converting a TensorFlow CNN to TFLite Micro on the ESP32-S3?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 0}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1428", "title": "Optimizing Model Deployment for Nordic nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert, optimize, and deploy a TFLite TinyML model on nRF5340 while handling operator coverage gaps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1429", "title": "Optimizing and Deploying a Quantized Model on ARM Cortex-M4 STM32F4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert a PyTorch or ONNX KWS CNN to TFLite Micro for an FPU-less STM32F4 and fit 256 KB SRAM while maintaining acceptable latency?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 2}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1430", "title": "Optimizing Semantic Segmentation Deployment on Cortex-M7 + Ethos-U55 with Model Format Conversion", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert a PyTorch segmentation model for Cortex-M7 plus Ethos-U55 when unsupported operators threaten latency and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1431", "title": "Optimizing ML Model Deployment on Nordic nRF5340 for Edge Inference", "topic": 
"model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you approach the conversion, operator handling, and memory optimization to successfully deploy this model within the nRF5340's constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1432", "title": "TinyML Model Deployment and Conversion Challenges on Resource-Constrained ARM Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare PyTorch and TensorFlow KWS models for STM32F4 deployment, conversion, and operator support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1433", "title": "Optimizing ONNX Model Conversion for ESP32-S3 Edge Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you handle ONNX-to-TFLite Micro operator gaps and memory placement for ESP32-S3 image classification?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 3}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1434", "title": "TinyML Model Conversion and Operator Gap Management for Nordic nRF5340 Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Outline a strategy for converting this 2.5 MB ONNX model to fit the nRF5340, detailing format conversion, operator gaps, and memory sizing.", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 2}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1435", "title": "Graceful Degradation for Real-time Anomaly Detection on Constrained TinyML Device", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M4 anomaly detector degrade gracefully under noise, congestion, or battery stress while preserving utility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1436", "title": "ESP32-S3 Anomaly Detection with Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a system that employs graceful degradation to maintain operational status even under severe resource 
constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1437", "title": "TinyML Graceful Degradation for Predictive Maintenance on Cortex-M7/Ethos-U55", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose this performance degradation and design a graceful degradation ladder to maintain fail-operational status?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1438", "title": "Graceful Anomaly Detection on Constrained TinyML", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy, considering degradation ladders, model fallbacks, fail-safe vs. fail-operational modes, and potential quality-of-service shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1439", "title": "Graceful Degradation Architectures for TinyML on ESP32-S3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which graceful degradation architecture is more suitable for critical ESP32-S3 anomaly detection, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1440", "title": "Graceful Degradation for Real-time TinyML Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M7 plus Ethos-U55 anomaly detector use degradation ladders, fail-safe states, and QoS shedding under overload?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1441", "title": "TinyML Graceful Degradation on STM32F4 for Real-time Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an STM32F4 safety-critical anomaly detector degrade when sensor noise or data spikes break inference latency thresholds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1442", "title": "ASIL-B TinyML Determinism on ARM Cortex-M4 STM32F4", "topic": "safety-certification", 
"competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do ASIL-B determinism, watchdogs, self-tests, and limited Cortex-M4 resources require WCET, interrupt, and memory discipline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1443", "title": "Functional Safety on ESP32-S3: Watchdog & ML Model Integrity for ASIL B", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an ISO 26262 ASIL B compliant functional safety mechanism on an ESP32-S3 for a TinyML model with a 30ms WCET and 100ms system deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1444", "title": "Functional Safety for TinyML on Resource-Constrained Automotive ECUs", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you ensure ISO 26262 ASIL-B functional safety for a Cortex-M4 brake-wear TinyML model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1445", "title": "ESP32-S3 TinyML Functional Safety & Determinism Optimization for ISO 26262", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose and reduce ESP32-S3 TinyML latency spikes that jeopardize ASIL-B deterministic execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1446", "title": "ASIL-C ML Safety on Cortex-M7 + Ethos-U55 for Automotive", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the strict requirements of ISO 26262 ASIL-C, how would you architect the system to ensure functional safety, particularly concerning the ML inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1447", "title": "Hardware Security for TinyML Model Integrity on Nordic nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which hardware security feature protects TinyML firmware and model integrity against tampering or extraction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1448", "title": "Secure TinyML Anomaly Detection on ESP32-S3 Against Adversarial Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": 
"design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 anomaly detector defend against model tampering, side-channel leakage, and poisoned OTA model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1449", "title": "SRAM Budgeting for On-Device Adversarial Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many additional 4096-element INT8 feature vectors per class fit in remaining SRAM for an on-device adversarial detector?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1450", "title": "Adversarial Attack on ESP32-S3 TinyML Classifier", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze and defend an ESP32-S3 gesture classifier against adversarial sensor perturbations within resource limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1451", "title": "Adversarial Robustness Evaluation on Cortex-M7 + Ethos-U55 for TinyML", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an evaluation framework to compare resilience against common adversarial attacks given platform constraints?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1452", "title": "Optimizing Adversarial Defense on Resource-Constrained TinyML", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system implement low-overhead adversarial defenses without breaking real-time latency deadlines on non-FPU hardware?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1453", "title": "Adversarial Robustness in ESP32-S3 Anomaly Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 anomaly detector mitigate adversarial sensor readings while balancing SRAM, PSRAM, and CPU overhead?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 2}, 
"chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1454", "title": "Mitigating Dataset Bias in TinyML Gesture Recognition on Cortex-M7/Ethos-U55", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data selection or annotation workflow helps mitigate dataset bias for an Ethos-U55 gesture model under TinyML constraints?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 0}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1455", "title": "Optimizing TinyML Data Labeling for nRF5340 Resource Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the current active learning pipeline fail under BLE and memory limits, and how should data selection be optimized?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 1}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1456", "title": "TinyML Anomaly Detection: Data Curation for ARM Cortex-M4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you design a device-aware data curation and labeling pipeline for an FPU-less Cortex-M4 anomaly detector?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1458", "title": "Diagnosing Field Performance Degradation Due to Dataset Bias in TinyML", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this performance degradation from a dataset curation and labeling perspective, specifically considering the tinyML platform constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1459", "title": "nRF5340 TinyML Anomaly Detection Data Curation Pipeline", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should an nRF5340 data curation pipeline select, annotate, and transfer high-value anomaly samples under 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1460", "title": "TinyML Dataset Curation: Annotation Budget & SRAM Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "How many 1-second audio snippets fit a 100-hour labeling budget, and how many 2 KB feature vectors could theoretically fit in the STM32F4's 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1461", "title": "Edge AI Keyword Spotting: Active Learning for Constrained Dataset Curation", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should active learning select rare-keyword audio samples for an Ethos-U55 KWS model while managing bias and SRAM limits?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 3}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1462", "title": "Optimizing TinyML Data Curation on nRF5340 for Active Learning", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and optimize this on-device data curation and transmission bottleneck to accelerate model improvement, and how would you quantify the impact of your proposed solution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1463", "title": "TinyML Anomaly Detection: Data Curation for Constrained Devices", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you curate and label vibration data so a Cortex-M4 anomaly model is representative, unbiased, and deployable?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 3}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1465", "title": "TinyML Stream Processing on STM32F4: Anomaly Detection Bottlenecks", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving from FFT to a small CNN break real-time STM32F4 vibration ingestion despite only 6 KB/s raw data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1466", "title": "Real-time RMS Feature Extraction on Cortex-M7 for TinyML Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many Cortex-M7 cycles per incoming 10 kHz sample are available for ingestion and sliding-window management?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-1467", "title": "TinyML Streaming Anomaly on nRF5340", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely root cause of the data drops and CPU spikes before inference on the nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1469", "title": "Real-Time Anomaly Detection on TinyML: Cortex-M7 + Ethos-U55 Architecture Evaluation", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Cortex-M7 plus Ethos-U55 streaming architectures compare feature extraction versus raw-data NPU inference?", "chain_ids": ["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1470", "title": "Real-time Sensor Data Ingestion Optimization on ARM Cortex-M4 TinyML", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose and eliminate STM32F4 sensor data loss in a fixed-point streaming feature pipeline?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1471", "title": "Federated Learning on ESP32-S3: Convergence and Memory Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ESP32-S3 federated learning of a 2 MB model suffer slow convergence and OOM errors, given its hardware constraints?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 0}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1472", "title": "Federated TinyML for Anomaly Detection on Cortex-M7/Ethos-U55", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a federated architecture that minimizes communication overhead, handles non-IID data, and fits within 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1473", "title": "Federated Learning on Resource-Constrained BLE Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Outline a concrete architecture and strategy to implement federated averaging given the strict 64KB RAM and BLE communication constraints.", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1474", "title": "Optimizing Federated Averaging on Resource-Constrained Edge Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should federated averaging on STM32F4 sensors reduce communication cost and on-device training energy for non-IID data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1475", "title": "Federated Learning on ESP32-S3: Scaling a TinyML Model for Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 federated averaging handle non-IID data and LoRaWAN limits across 10,000 TinyML sensors?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 2}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1476", "title": "Essential Responsible AI Documentation for ESP32-S3 TinyML Deployment", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which core piece of information, typically found in a model card or impact assessment, is absolutely essential to document for this system, even with limited on-device storage?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1477", "title": "Responsible AI on Constrained Edge Devices: Model Card & Guardrail Implementation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should responsible AI practices and on-device guardrails be implemented for an Ethos-U55 industrial anomaly detector?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1478", "title": "Responsible AI for TinyML: Edge System Governance on Cortex-M7 + Ethos-U55", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you address potential biases, ensure transparency, and handle model lifecycle management within this resource-limited environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-1479", "title": "TinyML Guardrails for Safety-Critical Systems on nRF5340", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What estimated resources would a heuristic guardrail consume, and what metrics must be added to the model card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1480", "title": "Responsible AI Architecture Evaluation on Constrained TinyML Devices", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which architecture better serves responsible AI on an FPU-less Cortex-M4 anomaly detector: a quantized CNN or an interpretable tree ensemble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1481", "title": "Responsible AI for Edge Safety: ESP32-S3 Anomaly Detection", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should model cards, impact assessments, red-teaming, and guardrails fit an ESP32-S3 safety-critical anomaly detector during the realization phase?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1484", "title": "Power Budget Allocation on Cortex-M7+Ethos-U55 IoT Node", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the current duty cycle meet the 7-day operational target?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1489", "title": "Roofline Feasibility Check for MobileNetV2 on Cortex-M4", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can MobileNetV2 fit on an STM32F4 with 1 MB flash and 256 KB SRAM, and what latency remains after shrinking it?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1490", "title": "Lightweight Attention Approximation on ESP32-S3", "topic": "attention-scaling", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does 4-head self-attention with seq_len 50 and d_model 64 fit in 512 KB SRAM on an ESP32-S3, and what alternative would you propose?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1491", "title": "Binary and Ternary Quantization Feasibility on Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Do binary or ternary weights provide a better tradeoff for a 100K-parameter model on a Cortex-M4 considering compute latency and accuracy risks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1492", "title": "PSRAM vs SRAM Latency Impact on Inference on ESP32-S3", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much latency does PSRAM add when an ESP32-S3 streams 500 KB of INT8 weights instead of using SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1494", "title": "Duty Cycling Strategy for Battery Life on Cortex-M4 Wearable", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate battery life and propose a duty cycling strategy for 7-day operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1496", "title": "Huffman Encoding for Weight Compression on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Estimate the compression ratio achievable with Huffman encoding and assess the runtime decompression cost.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1498", "title": "NAS Search Space Design for MCU-Deployable Model", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an ESP32-S3 NAS search space under 100 KB SRAM, 500 KB flash, 80% accuracy, and 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1499", "title": "TinyML System Specification for Predictive Maintenance Sensor Node", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a predictive maintenance system for industrial motors using a sensor node with Cortex-M7+Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1500", "title": "MCU Sleep Mode Strategy for Always-On Wake Word Detection", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", 
"level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the average power of an ESP32-S3 two-stage wake-word detector with 10 fps prefilter and 5% full-model triggers?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1501", "title": "INT4 Weight Quantization Feasibility for Cortex-M4 Without DSP", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is INT4 worthwhile for a 200 KB INT8 model on a Cortex-M4 without DSP when the flash budget is 64 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1502", "title": "SRAM Tiling Strategy for Large Conv Layer on Cortex-M7+Ethos-U55", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is your colleague correct that the 28x28x96 Ethos-U55 CNN layer needs tiling within 512 KiB SRAM, and how would you tile a larger 56x56x96 input?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 1}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1503", "title": "MCU Power State Machine for Multi-Model ML Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can an ESP32-S3 three-stage ML pipeline meet 1-year life on a 2000 mAh LiPo with a 5 mW always-on preprocessor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1517", "title": "Tensor Arena Sizing for MobileNetV2 on Cortex-M7", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is estimating a 2.1 MB SRAM requirement mathematically flawed, and what is the true arena size to fit a 96x96 INT8 MobileNetV2 within 512 KB of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1518", "title": "Flash vs SRAM Weight Placement Strategy", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the performance penalty, and which layers should you prioritize copying to SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1519", "title": "Tensor Lifetime Analysis for U-Net on MCU", "topic": 
"tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you reduce peak memory below 512 KB without changing the model architecture?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 3}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1520", "title": "Scratch Buffer Allocation for Convolution", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For a 3x3 conv with 32 input channels processing a 24x24 feature map, how large is the im2col scratch buffer, and how does this affect your tensor arena budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1521", "title": "Arena Planning for Multi-Model Deployment", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 share arenas for sequential VAD, KWS, and IC models needing 15 KB, 85 KB, and 120 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1522", "title": "Memory-Optimal Operator Execution Order", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which execution order (A→B→C→D or A→B→D→C) has lower peak arena usage on a 512 KB Cortex-M7, and what is the general principle?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 4}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1523", "title": "Peak SRAM Optimization via Channel Splitting", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does channel splitting (computing 64 channels at a time) reduce the peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1524", "title": "PSRAM Spilling Strategy for ESP32-S3", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If you place the entire arena in PSRAM, how much slower is inference, and what hybrid SRAM/PSRAM placement minimizes the slowdown?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 1}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-1525", "title": "Arena Fragmentation in Dynamic TinyML Workloads", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an arena management strategy that supports all 4 models without exceeding the strict 310 KB SRAM limit?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 2}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1526", "title": "Tensor Arena Planning for Streaming Audio on MCU", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can smaller ping-pong DMA buffers cut ESP32-S3 audio capture SRAM by 50% for 1-second, 16 kHz KWS windows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1527", "title": "Compile-Time Arena Verification", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a compile-time verification system that catches arena overflows before flashing firmware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1541", "title": "BLE Bandwidth Embedding Constraint", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the required embedding compression ratio to ensure a maximum 10ms transmission latency per embedding.", "visual": {"kind": "svg", "path": "tinyml-1541.svg", "alt": "Fanout diagram showing multiple Cortex-M4 sensor nodes transmitting data to a single central hub via BLE.", "caption": "Sensor Constellation Topology"}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1542", "title": "Cortex-M4 Energy-Harvesting Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an optimal checkpointing interval that minimizes total training overhead while bounding maximum lost progress (RPO) to 1 minute.", "visual": {"kind": "svg", "path": "tinyml-1542.svg", "alt": "Timeline showing training progress interrupted by power failures, with checkpoints saving state periodically.", "caption": "Training Progress and Rollback"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1543", "title": "Hailo-8 Wake-up Energy Drain", "topic": "duty-cycling", 
"competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the power drain discrepancy by calculating the actual continuous average power consumption causing the 2-day battery death.", "visual": {"kind": "svg", "path": "tinyml-1543.svg", "alt": "Timeline showing sleep and wake power spikes, highlighting a raised sleep baseline power.", "caption": "Elevated Sleep Power Baseline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1545", "title": "Hailo-8 PCIe Frame Ingestion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you optimize the host-to-accelerator pipeline to prevent PCIe bottlenecking, computing raw bandwidth before and after INT8 cast?", "visual": {"kind": "svg", "path": "tinyml-1545.svg", "alt": "Diagram showing 4 camera streams merging into a host CPU, crossing a PCIe link to the Hailo-8 accelerator.", "caption": "PCIe Gen 3 x2 Bottleneck"}, "chain_ids": ["tinyml-chain-auto-secondary-017-51"], "chain_positions": {"tinyml-chain-auto-secondary-017-51": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-51": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1546", "title": "Cortex-M4 Audio Buffer Overflow", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the time elapsed from an empty buffer until the system drops its first audio packet?", "visual": {"kind": "svg", "path": "tinyml-1546.svg", "alt": "Linear growth chart showing the buffer filling up steadily over 5 seconds until it hits the 100-packet cap.", "caption": "Deterministic Buffer Growth"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1547", "title": "Decentralized Federate Tree Aggregation", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the theoretical lower-bound latency for a single bottom-up aggregation phase from leaf to root.", "visual": {"kind": "svg", "path": "tinyml-1547.svg", "alt": "A binary tree diagram showing 16 nodes with data flowing upwards from the leaves to a single root node.", "caption": "Binary Spanning Tree (Depth 4)"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1551", "title": "Cortex-M4 Vibration FFT Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the minimum required FFT window queue size to prevent data loss during a vibration burst.", "visual": {"kind": "svg", "path": "tinyml-1551.svg", "alt": "Chart showing queue building up to 1 during the 5 second burst, then draining to 0 during silence.", "caption": "Burst 
Queue Accumulation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1560", "title": "Raw ECG BLE Stream Viability", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the bandwidth requirements and determine if raw streaming is viable or if edge inference is strictly required.", "visual": {"kind": "svg", "path": "tinyml-1560.svg", "alt": "Data flow from patch to phone showing a bottleneck.", "caption": "BLE Bandwidth Bottleneck."}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1562", "title": "Hailo-8 PCIe Gen3 Saturated", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the realistic per-direction bandwidth and ingress utilization of a PCIe Gen3 x1 link streaming 1080p 60fps video?", "chain_ids": ["tinyml-chain-auto-secondary-017-51"], "chain_positions": {"tinyml-chain-auto-secondary-017-51": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-51": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1563", "title": "M4 Boot Energy Dominance", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average power consumption over a 10-minute window when duty-cycling?", "visual": {"kind": "svg", "path": "tinyml-1563.svg", "alt": "Timeline showing 2s at 15mA, 1s at 20mA, and 597s at 0mA.", "caption": "Boot vs Inference Energy profile."}, "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 1}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1567", "title": "Flash-to-SRAM Paging Latency", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply memory-hierarchy principles to calculate the total data transfer time for paging a 160 KB model layer-by-layer.", "visual": {"kind": "svg", "path": "tinyml-1567.svg", "alt": "Bar chart comparing Flash read bandwidth with SRAM write bandwidth.", "caption": "Memory bus bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1593", "title": "Cortex-M4 Sensor Queue Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the optimal queue depth and drop policy to handle a Poisson arrival rate of 50 
Hz given a model execution time of 15 ms, to maintain 99% availability without overflowing a 2 KB SRAM buffer.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1594", "title": "MCU Audio Compute Bounding", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can a theoretical framework be formulated to estimate and compare the total MAC operations and cycle counts per 1-second audio window for both models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1597", "title": "Cascaded Wake-Up Architecture", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a cascaded wake-up duty-cycling architecture using a 1uW passive PIR sensor to trigger a fast 2ms INT8 cascade model before waking up the main 50ms classification model.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1600", "title": "RTOS Preemption for Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does replacing the polling loop with an RTOS task scheduler utilizing preemption fix the missing data issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1602", "title": "MCU Convolution Cycle Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Implement an analytical model to calculate the exact cycle count of a 3x3 depthwise convolution considering MAC instructions and SRAM load/store overheads.", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1605", "title": "Task-Based FRAM Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What task-based checkpointing rule can write registers to FRAM before the final fully-connected layer, including the boundary condition and the recovery procedure on brownout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1606", "title": "A/B Bank Model OTA Updates", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does an A/B bank flash layout impact 
available program space and wear-leveling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1607", "title": "Evaluate minimal queue capacity for transient TinyML sensor bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the minimum queue capacity required to ensure zero dropped events during a 1-second burst of 150 events?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1608", "title": "Identify TinyML memory regions for static weights and dynamic activations", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the correct memory regions where the weights and activation buffers must be placed to fit this model on the device.", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1613", "title": "Calculate energy efficiency between slow and fast microcontroller clock modes", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate which MCU configuration consumes less total energy per inference.", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1622", "title": "Pipelining Compute and SPI Transfer Time", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total time to process and transmit 4 chunks if compute takes 10ms per chunk and transmission takes 5ms per chunk.", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1626", "title": "Constrained Memory Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How must the model's memory be laid out to successfully execute without out-of-memory errors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1627", "title": "SRAM-Constrained Depthwise Execution", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "design", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "How can a computational schedule be designed to execute this layer entirely on-chip without requiring external DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1628", "title": "Peripheral DMA Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What standard hardware peripheral lets the CPU compute neural network layers while sensor data transfers occur in the background?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1629", "title": "Cortex-M4 Duty Cycle Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum active duty cycle percentage required to achieve a 100uW average power budget?", "visual": {"kind": "svg", "path": "tinyml-1629.svg", "alt": "Pie chart showing 99.4% sleep time and 0.6% active time", "caption": "Duty cycle breakdown for 100uW budget"}, "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 5}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1630", "title": "Hailo-8 Video Inference Queue", "topic": "queueing-theory", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the expected queueing delay for an incoming frame, and what strategy bounds average latency under 50ms?", "visual": {"kind": "svg", "path": "tinyml-1630.svg", "alt": "Hockey-stick curve showing latency exponentially rising as utilization approaches 1", "caption": "Latency vs Utilization Curve"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1631", "title": "BLE Mesh Bandwidth Collapse", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the bandwidth bottleneck, and can the current architecture support the data rate?", "visual": {"kind": "svg", "path": "tinyml-1631.svg", "alt": "8 nodes pointing to 1 central node", "caption": "8-to-1 Sensor Fanout Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1632", "title": "Solar Brownout Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the checkpointing strategy into NVRAM (FRAM) that minimizes write energy while ensuring the task 
completes across reboots?", "visual": {"kind": "svg", "path": "tinyml-1632.svg", "alt": "Timeline showing compute phase, checkpoint at 1s, brownout at 1.5s, and resumption", "caption": "Intermittent Power Execution Timeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1634", "title": "Smart-Ag Flash Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether a 5-minute or a 15-minute checkpoint interval minimises total expected energy across the 1-hour job, accounting for (a) the 80 ms flash erase+program at 50 mW per checkpoint and (b) the expected re-compute energy E[loss] = P_fail * mean_loss * compute_power. Identify by what factor the better interval wins and the regime in which the answer would flip.", "visual": {"kind": "svg", "path": "tinyml-1634.svg", "alt": "Line plot showing expected rollback penalty dropping as checkpoint frequency increases", "caption": "Rollback Penalty vs Checkpoint Interval"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1637", "title": "Energy Harvesting State Loss", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why the system makes zero progress and propose a non-volatile memory layout to guarantee eventual completion.", "visual": {"kind": "svg", "path": "tinyml-1637.svg", "alt": "Sawtooth timeline of execution progress dropping to 0 repeatedly", "caption": "Execution Progress without Checkpointing"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1639", "title": "LoRa Drone Swarm Consensus", "topic": "collective-communication", "competency_area": "parallelism", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a schedule that avoids packet collisions and compute the minimum consensus time.", "visual": {"kind": "svg", "path": "tinyml-1639.svg", "alt": "A star topology with Drone 0 as the central leader", "caption": "TDMA Star Consensus Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1643", "title": "UART WiFi Bottleneck Analysis", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the bandwidth mismatch and justify the required data reduction ratio.", "visual": {"kind": "svg", "path": "tinyml-1643.svg", "alt": "10 BLE nodes mapping to an MCU, bottlenecking at a UART connection to WiFi.", "caption": "Bandwidth funneling from BLE nodes through UART bottleneck."}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1644", "title": "Audio Event Queue Overflow", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the MCU buffer rapidly overflow when arrival rates exceed 95 events per second, and why is M/D/1 the correct model rather than M/M/1?", "visual": {"kind": "svg", "path": "tinyml-1644.svg", "alt": "Hockey-stick graph displaying expected buffer entries exploding as arrival rate approaches 100 events/sec.", "caption": "M/M/1 Queue Length vs Arrival Rate."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1646", "title": "MCU Weight Storage Hierarchy", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how this memory hierarchy strictly dictates the storage and execution strategy for the model's weights.", "visual": {"kind": "svg", "path": "tinyml-1646.svg", "alt": "Bar chart showing a 500KB model fitting in 1MB Flash but overflowing 256KB SRAM.", "caption": "Memory capacity vs model footprint."}, "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 1}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1652", "title": "BLE Mesh Federated Embedding Reduce", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total bytes transmitted per node for a 600-byte feature vector across the 3-node ring using the Ring AllReduce communication-cost formula, and why does Ring AllReduce win?", "visual": {"kind": "svg", "path": "tinyml-1652.svg", "alt": "Three node cyclic ring topology.", "caption": "Ring AllReduce logical topology for 3 nodes."}, "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1658", "title": "Cortex-M4 Quantization Memory Saturation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the root cause of the 5x latency degradation when migrating from 8-bit to 16-bit quantization on a Cortex-M4?", "visual": {"kind": "svg", "path": "tinyml-1658.svg", "alt": "Bar chart showing INT8 well under the 256KB threshold and INT16 drastically over it.", "caption": "SRAM Capacity vs Model Quantization Footprint."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1659", "title": "Cortex-M4 Seismic Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "Calculate the average power consumption if the MCU runs inference exactly once per second.", "visual": {"kind": "svg", "path": "tinyml-1659.svg", "alt": "A sleep/wake timeline showing 15mW spikes every 1 second over a near-zero baseline.", "caption": "Duty cycle power spikes across a 3-second window."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1662", "title": "Cortex-M4 Cascade Wakeword", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the expected average power consumption of this cascade architecture?", "visual": {"kind": "svg", "path": "tinyml-1662.svg", "alt": "Sleep/wake timeline showing constant DSP baseline with occasional Neural Net spikes.", "caption": "Cascade architecture duty cycling."}, "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 0}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1664", "title": "Smart Doorbell Shedding Policy", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum tolerable arrival rate before the queue length goes to infinity, and how do you design a shedding policy if arrivals burst to 100 frames per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1665", "title": "Energy Harvesting Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the optimal layer-wise activation checkpointing strategy to non-volatile memory that maximizes forward progress while minimizing write overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1666", "title": "SPI/DMA Pipeline Overlap", "topic": "communication-computation-overlap", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can a dual-buffer DMA system overlap SPI sensor reads with DSP computations to ensure zero dropped samples?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 5}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1667", "title": "RTOS Priority Inversion Diagnosis", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the root cause of the dropped frames, and how can priority-inversion or scheduling fix it?", "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1668", "title": "Frequency Scaling Duty Cycle", "topic": "duty-cycling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycling schedule and optimize the clock frequency to minimize total energy consumed per hour.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1669", "title": "INT8 Cortex-M4 Footprint", "topic": "quantization-fundamentals", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the exact SRAM footprint reduction and qualitative execution impact of quantizing a 50K-parameter model to INT8 on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1670", "title": "Depthwise Convolutions on MCU", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total MAC count compared to a standard convolution, and what are the minimal clock cycles required assuming 1 MAC/cycle?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1673", "title": "Differential LoRaWAN Updates", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Analyze the full payload transmission time and design a differential weight update protocol to minimize OTA time.", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1674", "title": "RTC State Machine Duty Cycle", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you implement the state machine timing to maximize battery life, calculating the exact active duty cycle percentage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1675", "title": "Asymmetric INT8 Calibration", "topic": "quantization-fundamentals", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between using asymmetric Min-Max quantization versus symmetric calibration for preserving activation fidelity.", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1676", "title": "Microcontroller Cycles and SRAM for Inference Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total instruction cycles required for a single forward pass, specifying the SRAM footprint required for intermediate activations.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1678", "title": "Depthwise Separable Convolution Execution Cycles on Microcontrollers", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why the actual inference latency on the Cortex-M4 only improved by 2.5x despite the 8x reduction in theoretical MAC operations.", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1681", "title": "Cycle-Accurate Execution Cost Model for 1D Convolutions", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an explicit cycle-accurate cost model for this 1D convolution and quantify the gap versus a naive 1.2x MAC heuristic.", "validated": false, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1682", "title": "Flash Memory Weight Placement for Bare-Metal Microcontrollers", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Specify how the microTVM compiler handles the model's weight initialization and memory layout in a bare-metal environment.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1688", "title": "TinyML Memory Paging Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can layer-wise memory overlay be implemented to execute this model within the SRAM limit?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 3}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1690", "title": "TinyML DMA Audio Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can Direct Memory Access (DMA) be applied to overlap continuous audio data acquisition with the CPU's neural network inference?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1694", "title": "Calibration Bias in PTQ", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the potential causes of this accuracy drop and identify which quantization step likely introduced the bias.", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1695", "title": "Wake-Word Event Queue Stability Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the maximum arrival rate this wake-word system can handle before the event queue diverges to infinity, and calculate the queue utilization if events arrive at 10 per second.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1696", "title": "Intermittent Power Checkpoint Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an evaluation metric for the checkpointing overhead and calculate the percentage of time spent checkpointing if state is saved after every 30ms of active compute.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1703", "title": "MCU Minimum Theoretical Inference Latency", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply these hardware constraints to calculate the absolute minimum latency for a single inference, ignoring memory limits.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1714", "title": "Recalling Little's Law Stability Conditions for Buffers", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the condition under which Little's Law (L = lambda * W) holds true for this buffer system.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1716", "title": "Throughput Impact of I/O Overlap on Microcontrollers", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the realized throughput gain from double-buffering, accounting for DMA bus contention, and identify whether the chip's single-bank SRAM bottlenecks the maximum overlap.", "chain_ids": 
["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 4}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1718", "title": "XIP Execution and SRAM Limits in Microcontrollers", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why copying weights to SRAM will cause an OOM error, and how the memory hierarchy should be mapped to succeed.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1719", "title": "DMA Overlap to Meet Hard Real-Time Deadlines", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a single-threaded execution model sustain the 100Hz rate, and what is the CPU idle percentage if DMA is used?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1720", "title": "Diagnosing Heap Exhaustion in TF Lite Micro", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the likely cause of this memory exhaustion, given that TFLM utilizes a static memory arena for model execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1721", "title": "Average Power Calculation in Duty-Cycled Sensors", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate average current including analog leakage, estimate the realistic battery lifetime accounting for the non-linear discharge curve, and identify the dominating loss term.", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 3}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1722", "title": "Missing Calibration in Post-Training Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What critical quantization step was likely omitted or executed with unrepresentative data?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1723", 
"title": "Triple-Buffering Pipeline Throughput Constraint", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a true triple-buffer scheme and compute the strict SRAM footprint needed, comparing it against ping-pong buffering.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1724", "title": "Wake-up Penalty Reduction via Batching", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how batching 10 inferences every 500 ms affects power efficiency AND determine which retention mode wins under both the unbatched (50 ms) and batched (500 ms) duty cycles.", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 2}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1725", "title": "Cortex-M4 Inference Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the sustained inferences per second assuming 50% MAC hardware utilization.", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1726", "title": "Static Quantization Parameters", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how post-training static quantization computes the specific integer parameters required to convert FP32 activations into INT8.", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1728", "title": "Camera Trap State Machine", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the optimal hardware power state transitions to minimize energy per event.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1731", "title": "ARM SIMD DSP Extensions", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Recall the specific architectural instruction set feature used to execute multiple INT8 MACs per clock cycle.", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1732", "title": "DMA Audio Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does ping-pong buffering permit continuous I2S DMA, and what pointer-swap logic prevents dropped audio frames?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1733", "title": "OTA Memory Alignment Faults", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the memory layout and identify the low-level addressing error crashing the CPU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1734", "title": "Evaluate Latency Bounds in Event-Driven Processing", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the MCU sustain this arrival rate without unbounded queue growth if a 2ms sleep-to-wake overhead is aggressively applied per event?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1735", "title": "Mitigating High Activation Memory in Deep Layers", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What memory management strategies can execute this specific bottleneck layer without exceeding the 256KB hardware SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1736", "title": "Design MAC Budget for Sub-50ms Wakeword Engine", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a theoretical compute budget in MACs for the neural network, assuming 4 MACs per SIMD instruction and 25% true pipeline utilization, to guarantee a sub-50ms inference time on an 80 MHz MCU.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1740", "title": "Analyze Checkpoint Overhead in Energy-Harvesting MCUs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does varying the checkpoint frequency impact the total expected inference time under an unstable power environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1741", "title": "Diagnosing Quantization Collapse in Low-Amplitude Signals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how global symmetric quantization scales interact with high-variance input sensors, causing structural failure on subtle data patterns.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1742", "title": "Evaluate Sliding Window vs Quantized KV Cache", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the architectural trade-offs of using Sliding Window Attention versus INT8 KV Cache quantization to guarantee bounded SRAM usage without halting.", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1744", "title": "Analyze Execute-In-Place Model Switching Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does configuring the memory protection unit to enable Execute-In-Place (XIP) memory-mapped architecture eliminate this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1745", "title": "Design a Sub-1mW Cascaded Acoustic Wake-up", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you design a multi-stage pipeline that guarantees the total average power stays below 1mW while processing sporadic acoustic events?", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 4}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1747", "title": "INT8 Quantization Memory Reduction", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the exact SRAM memory savings in bytes when quantizing this activation map from FP32 to INT8.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1750", "title": "Dense Layer MAC Count", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total number of Multiply-Accumulate (MAC) operations required for a single forward pass of this layer.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1758", "title": "Evaluating INT16 Accumulation Trade-offs on Cortex-M4", "topic": 
"quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the trade-offs of switching from INT32 to INT16 accumulation for intermediate dense layers to save memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1767", "title": "XIP Flash Memory for TinyML Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a memory management strategy to seamlessly switch models without requiring a full device reboot or adding external RAM.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1769", "title": "In-Place Depth-First Execution on Cortex-M4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply in-place depth-first execution to reduce the peak memory footprint below 64KB.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1770", "title": "Bare-Metal DMA Ring Buffers for TinyML", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a bare-metal inference loop leveraging DMA to feed the RNN without CPU polling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1773", "title": "Optimal Checkpointing for Intermittent Power", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine an optimal checkpointing frequency for the intermediate layer activations to ensure forward progress across power cycles.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1778", "title": "Analyzing Queue Capacity During Traffic Bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the utilization of the MCU and predict if the system can handle a temporary burst of 10 events/sec for 2 seconds.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1779", "title": "Applying Loop Tiling to SRAM Limits", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which memory optimization technique can be applied to fit this layer's execution within the available 64KB limit?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1780", "title": "Evaluating OTA Payload Limits for Firmware", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the trade-offs of A/B partitioned firmware updates versus dynamically loadable neural network blob updates?", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1782", "title": "Analyzing SRAM Placement for Low-Latency Audio", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the optimal placement of the 16KB audio ring buffer within the Cortex-M4's memory hierarchy to minimize power consumption.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1783", "title": "Analyzing Watchdog Timers for Simple Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the simplest fault-tolerance mechanism to recover from a brownout during the middle of the daily inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1788", "title": "SRAM vs Flash Layout", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the memory layout and determine how to execute the model without exceeding the SRAM limits.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1791", "title": "Intermittent Power Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify a non-volatile memory checkpointing routine that guarantees forward progress across brownouts.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1794", "title": "Microcontroller Event Power", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption of this duty-cycled system, assuming computation takes exactly 50ms per 2-second cycle.", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 0}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1796", "title": "Cortex-M4 SPI-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the percentage reduction in per-inference latency if DMA is configured to completely overlap the BLE transmission with the next inference computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1797", "title": "Cortex-M4 Flash vs SRAM execution", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the execution cycle difference between streaming directly from Flash for 5 passes versus copying the 100KB model to SRAM once?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 2}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1806", "title": "Power Loss Flash Write Margin Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the safety margin in milliseconds remaining after the state is successfully flushed.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1808", "title": "Logical Ring on Physical Daisy-Chain", "topic": "collective-communication", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization stall time of a logical Ring AllReduce versus a bidirectional line-accumulator (Tree), accounting for a 10 microsecond SPI interrupt latency per step.", "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1809", "title": "RP2040 Core FIFO Spinlock", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the pipeline latency per audio frame if the FIFO spinlock synchronization costs 100us.", "chain_ids": ["tinyml-chain-auto-secondary-017-34"], "chain_positions": {"tinyml-chain-auto-secondary-017-34": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1810", "title": "AHB Bus Interrupt Barrier", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "Calculate the pipeline throughput assuming an AHB bus interrupt barrier requires 50us to clear.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1811", "title": "ESP-NOW Link Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the resulting pipeline bubble given the ESP-NOW MAC protocol imposes a 1.5ms synchronization latency per packet.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1812", "title": "SPI Daisy Chain Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the end-to-end inference latency for a single sample, factoring in a 200us SPI DMA setup synchronization barrier at each hop.", "chain_ids": ["tinyml-chain-auto-secondary-017-35"], "chain_positions": {"tinyml-chain-auto-secondary-017-35": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1813", "title": "Dual-Core RP2040 Pipeline Bubble Fraction", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the pipeline compute bubble fraction for Core 1 during steady-state processing?", "chain_ids": ["tinyml-chain-auto-secondary-017-34"], "chain_positions": {"tinyml-chain-auto-secondary-017-34": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1814", "title": "Asymmetric Cortex-M Pipeline Synchronization Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total latency for the 4-microbatch pipeline, including the pipeline flush and barrier synchronization overheads.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1816", "title": "SPI Double-Buffering Pipeline Barrier Computation", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the maximum SPI payload size allowable to maintain the optimal 4ms pipeline throughput, assuming a 50us DMA setup barrier per transfer.", "chain_ids": ["tinyml-chain-auto-secondary-017-35"], "chain_positions": {"tinyml-chain-auto-secondary-017-35": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1817", "title": "Half-Duplex UART Ring 
AllReduce Diagnosis", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the root cause of the degraded latency by computing the theoretical lower bound for the Ring AllReduce operation on this topology.", "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1818", "title": "CAN Bus AllGather Broadcast Arbitration", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total time required for an AllGather operation between the two nodes, including the bus framing overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1819", "title": "Shared Medium RF Parameter Server Topology", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which topology minimizes total transmission time over the shared RF medium, calculating the minimum time for the optimal choice.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1821", "title": "BLE 5.0 Star Topology Collective Sync", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the total latency for an AllReduce equivalent operation where both followers send 350 KB to the leader, and the leader broadcasts the averaged 350 KB model back?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1822", "title": "LoRa Mesh Duty Cycle Pipeline Impact", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the absolute minimum elapsed time for a single node to complete its AllReduce portion, factoring in the duty-cycle stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1823", "title": "Wi-Fi 6 CSMA Gossip Protocol Congestion", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the total expected time to reach 99% weight propagation across the network, accounting for the CSMA-enforced concurrency limit.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}] +[{"id": "cloud-0000", "title": "The Per-Token KV-Cache Cost", "topic": 
"vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much FP16 KV-cache memory does one added token consume for this 40-layer, 64-head, 128-dim-head model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 655 KB", "About 2.62 MB", "About 32.7 KB", "About 1.31 MB"], "correct_index": 3}}, {"id": "cloud-0001", "title": "The Continuous Batching Target", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following metrics is continuous batching designed to primarily improve?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 0}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time To First Token (TTFT)", "System Throughput / Average Time Per Output Token (TPOT)", "Model Loading Time", "GPU Idle Time"], "correct_index": 1}}, {"id": "cloud-0002", "title": "The KV-Cache Memory Hog", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate VRAM consumed by the KV cache for this single request?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 0}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.56 GiB", "~32 GiB", "~244 GiB", "~2.56 MiB"], "correct_index": 2}}, {"id": "cloud-0004", "title": "The VRAM Cost of Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the FP16 KV-cache VRAM for one 4,096-token request on this 80-layer, d_model=8192 model?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 1}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.25 GB", "5 GB", "10 GB", "80 GB"], "correct_index": 2}}, {"id": "cloud-0006", "title": "The Throughput Saturation Fallacy", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long does it take one saturated node to serve 5,000 tokens at 3,000 tokens/s, and does user distribution change that?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Scenario A is slower due to multi-user overhead.", "Scenario B is slower because long generations are inefficient.", "Both scenarios take ~1.67 seconds to complete.", "Both scenarios take ~0.06 seconds to complete."], "correct_index": 2}}, {"id": "cloud-0007", "title": "The Network Tax: NVLink vs. 
InfiniBand", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack data transfer using InfiniBand NDR compared to a GPU-to-GPU transfer within the same server using NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same latency (~1x)", "About 2x slower", "About 10x slower", "About 100x slower"], "correct_index": 2}}, {"id": "cloud-0008", "title": "The Speed of Light Constraint in RAG", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Ignoring compute, disk I/O, and queuing, what is the approximate minimum RTT for a single retrieval across the Atlantic?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5 µs (Inter-rack trap)", "~1 ms (Regional zone trap)", "~40 ms", "~500 ms (Satellite trap)"], "correct_index": 2}}, {"id": "cloud-0009", "title": "The Blue/Green Memory Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak GPU memory is required to hold both old and new FP16 7B model weights during the blue/green rollout?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 1}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB. The new model replaces the old one, so you only need space for one.", "7 GB. A 7B model requires 7GB of memory.", "28 GB. Both the old and new 14 GB models must be in memory at the same time.", "56 GB. 
An FP16 model uses 4 bytes/param, and you need two of them."], "correct_index": 2}}, {"id": "cloud-0010", "title": "The Geographic Skew Tax", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much slower is a single read from the cross-country archive compared to the local SSD?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0011", "title": "The Llama 3 KV Cache Footprint", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the FP16 KV-cache VRAM for one 8,192-token Llama 3 8B request, and how does using KV heads versus query heads change it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4 GiB", "0.5 GiB", "1 GiB", "2 GiB"], "correct_index": 2}}, {"id": "cloud-0012", "title": "The True Cost of Batching on TTFT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What maximum batch size keeps TTFT under 250ms with a 100ms batching window and 15ms prefill per request?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 requests. The worst-case queueing time must be subtracted from the SLO before calculating batch capacity.", "16 requests. The SLO of 250ms can be divided directly by the 15ms per-request time.", "6 requests. This assumes the queue time (100ms) and processing time (150ms) are independent.", "It's unlimited. 
The H100 is fast enough that prefill time is negligible."], "correct_index": 0}}, {"id": "cloud-0013", "title": "The TPOT Memory Wall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary hardware bottleneck for Time Per Output Token (TPOT), and can the GPU theoretically meet this user's expectation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the GPU is compute-bound during generation and is limited to ~15 tokens/sec.", "No, FP16 memory reads (140GB) limit the GPU to ~24 tokens/sec, just barely meeting the demand.", "Yes, the GPU's memory bandwidth supports a theoretical speed of ~95 tokens/sec, so the issue is likely in the software.", "Yes, because the KV-cache makes all subsequent token generation instantaneous."], "correct_index": 2}}, {"id": "cloud-0014", "title": "The RAG Retrieval Step-Cost", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency you should expect for a single, random read from this SSD to retrieve a document chunk?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 ns", "~5,000 ns (5 µs)", "~100,000 ns (100 µs)", "~40,000,000 ns (40 ms)"], "correct_index": 2}}, {"id": "cloud-0016", "title": "The Skew from the Disk", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Based on the fundamental physics of a computer, what is a primary suspect for this drop in performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVMe SSD read is roughly 10x slower than the memory access, which is a minor source of error.", "A subtle floating-point precision difference between the Python training and C++ serving environments.", "The NVMe SSD read is over 300x slower than HBM memory access, likely causing data to be unavailable at inference time.", "The CPU clock speed is dynamically throttled lower during inference, affecting numerical stability."], "correct_index": 2}}, {"id": "cloud-0018", "title": "The Static Batching Penalty", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum TTFT delay does static batching add for a single request to an idle server with batch size 8 and a 100ms timeout?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0ms, because the server is idle and can process the request immediately.", "~5ms, the approximate time to generate a single token.", "100ms, because the server waits for the batching timeout to expire.", "800ms, calculated by multiplying the batch size by the timeout."], "correct_index": 2}}, {"id": "cloud-0019", "title": "Continuous Batching and TPOT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", 
"bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the effective TPOT for the new user in the batch?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 1}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms, because the 20ms step time is multiplied by the 5 users in the batch.", "4ms, because the 20ms step time is divided by the 5 users in the batch.", "20ms, because one token is generated for all users in a single step.", "33ms, because the system will throttle to match the SLO exactly."], "correct_index": 2}}, {"id": "cloud-0020", "title": "The 4x Data Cost Bug", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the impact of this OTA change on daily ingest volume for the 100-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~240 TB per day", "~48 TB per day", "~480 TB per day", "~4.8 TB per day"], "correct_index": 2}}, {"id": "cloud-0022", "title": "The Continuous Batching Queue", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is the service stable at 5 RPS with 128 generated tokens per request, and what is the average request time in the system?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["160 ms. The system is under capacity, so latency is simply the time it takes to service one request.", "200 ms. The latency is the service time plus a 20% overhead for being busy.", "800 ms. The system is 80% utilized, leading to significant queueing delay which quintuples the average latency.", "The system is unstable and will crash, because the required token rate is too close to the maximum."], "correct_index": 2}}, {"id": "cloud-0023", "title": "The Blue/Green Memory Squeeze", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a single 80 GB H100 support a blue/green deployment switching from a 15B to a 30B FP16 model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, the new 60 GB model fits within the 80 GB capacity.", "Yes, the total memory needed is only 45 GB (15 GB + 30 GB).", "No, the combined 90 GB footprint exceeds the 80 GB capacity.", "Yes, there is 50 GB free, and the new model is only 30 GB larger."], "correct_index": 2}}, {"id": "cloud-0024", "title": "The True Cost of an A/B Test", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the FP16 memory footprint difference between the 1B and 7B models, and what is the serving cost implication?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["24 GB. 
The experiment will have a higher memory cost due to using 4 bytes per parameter.", "6 GB. The memory increase is manageable as it only requires 1 byte per parameter.", "12 GB. The experimental model requires 7x more memory, significantly increasing the serving cost per user in the A/B test.", "14 GB. The experimental model needs 14 GB, which is the primary cost driver."], "correct_index": 2}}, {"id": "cloud-0025", "title": "The KV-Cache VRAM Budget (cloud-0025)", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can this request be handled by a single H100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~140 GB. It will not fit, as this is larger than the H100's 80GB VRAM.", "~84 GB. It will just fit, but leaves no room for the model weights.", "~335 GB. It will not fit, as this is over 4x the H100's 80GB VRAM.", "~42 GB. It will require at least half of an H100's 80GB VRAM for a single request's KV-cache alone."], "correct_index": 3}}, {"id": "cloud-0026", "title": "The Continuous Batching Dilemma", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What happens to TTFT and token throughput if the batching wait window increases from 20ms to 100ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Average wait time decreases by 10ms.", "Average wait time remains 20ms.", "Average wait time increases by 40ms.", "Average wait time increases by 100ms."], "correct_index": 2}}, {"id": "cloud-0027", "title": "The Little's Law Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What happens to queue wait time as arrivals approach the 100 req/s service rate with a 10ms first-token service time?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 1}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Wait time scales linearly, reaching 99ms at 99 req/s, staying just under the 100ms SLO.", "Wait time remains near 10ms (the service time) as long as arrival rate < 100 req/s.", "Wait time grows exponentially, hitting 200ms at 95 req/s and violating the 100ms SLO.", "Wait time is bound by the 10ms compute time, allowing 99 req/s safely."], "correct_index": 2}}, {"id": "cloud-0028", "title": "The Interconnect Latency Ladder", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of these communication links typically has the highest intrinsic latency, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink 4.0 Transfer (GPU-GPU, within server)", "PCIe Gen5 Transfer (CPU-GPU, within server)", "InfiniBand NDR Transfer (server-to-server)", "HBM3 Memory Access"], "correct_index": 2}}, {"id": "cloud-0031", "title": "The KV Cache Memory Trap", "topic":
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a 7B FP16 model with a 128k-token request on an 80 GB GPU, what component is the primary OOM driver?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model parameters (weights)", "Optimizer state (e.g., Adam)", "The KV Cache", "Intermediate activations for the final token"], "correct_index": 2}}, {"id": "cloud-0033", "title": "The Runaway KV-Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How large is the FP16 KV-cache for the 64,000-token request, and why does it OOM an 80 GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~335 GB. This far exceeds the available memory.", "~5 MB. This is negligible and shouldn't cause an OOM.", "~168 GB. The cache size far exceeds the remaining VRAM.", "35 GB. The memory is determined by the weights, so the error must be from fragmentation."], "correct_index": 0}}, {"id": "cloud-0035", "title": "The RAG Index Rollout", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If all 100 pods attempt to download the 10 GB file simultaneously from your artifact storage, how long would the download phase take?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.5 seconds", "0.2 seconds", "20 seconds", "0.025 seconds"], "correct_index": 2}}, {"id": "cloud-0036", "title": "The Head-of-Line Blocking Problem", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary latency-related problem that continuous batching is designed to solve compared to static batching?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 0}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It primarily increases the maximum theoretical throughput (tokens/sec) of the GPU.", "It solves head-of-line blocking, where short requests are stuck waiting for the longest request in a batch to complete.", "It reduces the VRAM required for the KV cache by using a different compression algorithm.", "It strictly processes requests in a first-in, first-out (FIFO) order to ensure fairness."], "correct_index": 1}}, {"id": "cloud-0041", "title": "The Batching Tipping Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the batch-1 arithmetic intensity, and how does increasing batch size shift the workload from memory-bound to compute-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The workload remains memory-bound because Arithmetic Intensity is constant; both 
compute and memory scale linearly.", "The workload is compute-bound at batch 1 and just becomes more compute-bound at batch 64.", "The workload shifts from memory-bound to compute-bound as the AI increases from ~33 to ~310 Ops/Byte.", "The workload becomes compute-bound, but its AI decreases because the memory grows faster than the compute."], "correct_index": 2}}, {"id": "cloud-0043", "title": "TTFT and TPOT for the 20th Output Token", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long until the user receives the 20th output token with 150ms prefill and 30ms TPOT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600ms", "180ms", "750ms", "780ms"], "correct_index": 2}}, {"id": "cloud-0045", "title": "The Iceberg of Inference Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the dominant, long-term cost for this 24/7 RAG system?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 0}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of training the 7B model.", "The annual cost of storing the vector database embeddings.", "The annual cost of running the GPU for 24/7 inference.", "The network bandwidth costs for handling user queries."], "correct_index": 2}}, {"id": "cloud-0046", "title": "The KV-Cache Memory Bomb: VRAM Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory is required strictly for the KV-cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5.3 GB (Calculated Trap: Forgets either the K/V pair multiplier or 2-byte FP16 multiplier)", "~2.6 GB (Calculated Trap: Forgets both the K/V multiplier and 2-byte FP16 multiplier)", "~80 GB (Calculated Trap: Confuses KV-cache size with total model weight footprint)", "~10.7 GB"], "correct_index": 3}}, {"id": "cloud-0047", "title": "The Static Batching Latency Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For a request arriving just after the 50ms static batching window opens, what components make up its worst-case TTFT?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 0}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25ms", "50ms", "75ms", "100ms"], "correct_index": 2}}, {"id": "cloud-0048", "title": "The RAG Retrieval Bottleneck", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the most likely source of the RAG chatbot's 
high latency: LLM HBM access, network hop, or NVMe vector DB read?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reading documents from the NVMe SSD vector database", "LLM forward pass memory access to HBM", "Network transfer to the database server via InfiniBand", "L2 cache misses on the GPU during the forward pass"], "correct_index": 0}}, {"id": "cloud-0049", "title": "The Batching Window Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With a 200ms P99 TTFT SLO, 30ms network latency, and 50ms GPU time, what is the maximum queue wait for batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["150 ms", "170 ms", "120 ms", "200 ms"], "correct_index": 2}}, {"id": "cloud-0052", "title": "The Rollout Memory Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total additional HBM is required across 500 FP16 instances when upgrading from a 7B to a 13B model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3 TB", "13 TB", "6 TB", "12 TB"], "correct_index": 2}}, {"id": "cloud-0053", "title": "The Static Batching Waiting Game", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the average queue wait before processing for batch size 8 when requests arrive every 150ms?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 0}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms", "1050ms", "525ms", "800ms"], "correct_index": 2}}, {"id": "cloud-0055", "title": "The Concurrent User Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many concurrent users can one GPU support for a 13B FP16 model at 50% peak if each needs 64 tokens/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 148 users", "About 297 users", "About 594 users", "About 19,019 users"], "correct_index": 1}}, {"id": "cloud-0056", "title": "The On-Node Interconnect Tax: NVLink vs. 
PCIe", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a data transfer over a standard PCIe Gen5 bus compared to a direct GPU-to-GPU transfer using NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About the same latency", "~10x slower", "~2x slower", "~18x slower"], "correct_index": 2}}, {"id": "cloud-0057", "title": "The FP16 Inference Memory Rule", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the minimum RAM or HBM needed just to load Llama-3-8B weights in standard FP16 precision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 GB", "128 GB", "16 GB", "2 GB"], "correct_index": 2}}, {"id": "cloud-0058", "title": "The Continuous Batching Deadline: Batching Strategies", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the worst-case TTFT when a request arrives just after an iteration starts, and does it meet the 150ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40ms. The TTFT is determined solely by the prefill computation time.", "10ms. The TTFT is equivalent to the Time Per Output Token (TPOT).", "50ms. The TTFT is the prefill time plus the worst-case wait for the next iteration cycle.", "Up to several seconds. 
The request must wait for the longest sequence in the current batch to complete."], "correct_index": 2}}, {"id": "cloud-0059", "title": "The Basic Inference Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before even considering the KV cache or the retrieval index, what is the minimum memory required just to load the model's weights for inference using FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~7 GB", "~112 GB", "~14 GB", "~2 GB"], "correct_index": 2}}, {"id": "cloud-0061", "title": "The RAG Update Bottleneck", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming you have to ship the entire 200 GB file, how long would it take to transfer the index to a single serving pod over 400 Gbps InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["32 seconds", "2.7 minutes", "4 seconds", "0.5 seconds"], "correct_index": 2}}, {"id": "cloud-0062", "title": "The Real-Time Batching Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What maximum batch size fits a 100ms SLO for generating 50 tokens on a 7B model using 50% of H100 peak FP16 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["142", "5", "70", "23"], "correct_index": 2}}, {"id": "cloud-0063", "title": "The Blue-Green Memory Tax: Model Serving Infrastructure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total FP16 HBM is consumed just by the 7B and 13B model weights during the blue-green transition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 GB", "20 GB", "40 GB", "80 GB"], "correct_index": 2}}, {"id": "cloud-0064", "title": "The Batching Window Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the worst-case TTFT with 150ms slotted batching and 80ms prefill, and does it satisfy the 200ms real-time requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 ms. The TTFT is simply the pre-fill computation time.", "150 ms. The TTFT is determined by the batching window, as it's the longest delay.", "230 ms. The worst case is the full batch window delay plus the pre-fill time.", "70 ms. 
The available time is the batch window minus the compute time."], "correct_index": 2}}, {"id": "cloud-0065", "title": "The Cold Start Penalty", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much slower is this initial read from the SSD compared to a subsequent read from HBM?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 0}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30x slower", "~3x slower", "~330x slower", "~3,300x slower"], "correct_index": 2}}, {"id": "cloud-0066", "title": "The Canary Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much inference memory does the 70B FP16 model need, and can it fit on a single H100 GPU?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 0}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model requires 70 GB, so it will fit on the 80 GB H100.", "The model requires 140 GB; you just need to update the pod's memory request in Kubernetes.", "The model requires 140 GB, which exceeds the H100's 80 GB. A multi-GPU strategy is now required.", "The model requires over 1.1 TB to store optimizer states, making it impossible to serve."], "correct_index": 2}}, {"id": "cloud-0067", "title": "The Static Batching Throughput Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the largest static batch size that meets the 200ms TTFT SLA, and what token throughput does the 10ms decode step provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Max Batch: 20, Throughput: 2000 tokens/sec", "Max Batch: 13, Throughput: 1300 tokens/sec", "Max Batch: 10, Throughput: 1000 tokens/sec", "Max Batch: 10, Throughput: 100 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0068", "title": "The On-Node Interconnect Ladder", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which interconnects most likely correspond to these two latencies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ~500ns transfer is via L2 Cache (calculating ~1.5ns); the ~1,000ns transfer is via NVLink 4.0.", "The ~500ns transfer is via NVMe SSD read (calculating ~10,000ns); the ~1,000ns transfer is via PCIe Gen5.", "The ~500ns transfer is via NVLink 4.0; the ~1,000ns transfer is via PCIe Gen5.", "The ~500ns transfer is via InfiniBand NDR (calculating ~2,000ns); the ~1,000ns transfer is via NVLink 4.0."], "correct_index": 2}}, {"id": "cloud-0069", "title": "The RAG 
Retrieval Tax", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency for a single random read from an NVMe SSD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 ns (HBM Memory Access)", "~5 \\u00b5s (Cross-Rack InfiniBand)", "~100 \\u00b5s (NVMe SSD Read)", "~40 ms (Cross-Country Fiber)"], "correct_index": 2}}, {"id": "cloud-0072", "title": "The Inference Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For basic inference using half-precision (FP16), how much GPU memory should you budget just to load the model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "28 GB", "112 GB", "14 GB"], "correct_index": 3}}, {"id": "cloud-0075", "title": "The Static Batching Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the user's Time-To-First-Token (TTFT)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10ms", "40ms", "50ms", "30ms"], "correct_index": 2}}, {"id": "cloud-0076", "title": "The On-Node vs. Off-Node Divide", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which is faster for a small payload, same-server GPUs over NVLink or different servers over InfiniBand, and by roughly what factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same speed.", "InfiniBand is ~2x faster.", "NVLink is ~10x faster.", "NVLink is ~100x faster."], "correct_index": 2}}, {"id": "cloud-0077", "title": "The FP16 Inference Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To serve this model for inference, approximately how much GPU memory is required to hold just the model weights in standard half-precision (FP16)?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 0}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "14 GB", "28 GB", "112 GB"], "correct_index": 1}}, {"id": "cloud-0079", "title": "The On-Node vs. 
Off-Node Latency Chasm", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much slower is a cross-rack InfiniBand NDR transfer compared to an on-node NVLink 4.0 transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand is about the same latency as NVLink.", "InfiniBand is about 100x slower than NVLink.", "InfiniBand is about 10x slower than NVLink.", "InfiniBand is about 2x faster than NVLink."], "correct_index": 2}}, {"id": "cloud-0080", "title": "The 7B Model Memory Footprint: Model Serving Infrastructure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What absolute minimum memory is required to load the 7B model weights for FP16 inference before orchestration or KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "7 GB", "14 GB", "28 GB"], "correct_index": 2}}, {"id": "cloud-0081", "title": "The Canary Rollout Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What minimum VRAM is required to load the new 13B model for FP16 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["13 GB", "52 GB", "26 GB", "208 GB"], "correct_index": 2}}, {"id": "cloud-0082", "title": "The Continuous Batching Throughput Limit", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Using Little's Law, how would you estimate peak requests/sec and tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~66 QPS", "~1.7 QPS", "~16.5 QPS", "~2,112 QPS"], "correct_index": 2}}, {"id": "cloud-0084", "title": "The Static Batching Timeout Trap", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the TTFT components for this idle static-batching case, and what best-case TTFT will the user see?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5ms. (Calculated Trap: Only decoding time)", "50ms. (Calculated Trap: Only prefill time, missing timeout)", "250ms.", "255ms. 
(Calculated Trap: Prefill + decoding + missing timeout)"], "correct_index": 2}}, {"id": "cloud-0086", "title": "The Blue/Green Capacity Trap", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many H100 GPUs must you provision at peak for a blue-green rollout of this 10-replica deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 GPUs. The new model replaces the old one, so the same number of GPUs is sufficient.", "11 GPUs. You just need one extra GPU to start the rollout, and the orchestrator will handle the rest.", "20 GPUs. The entire new 'green' deployment must run in parallel with the old 'blue' deployment before traffic is switched.", "19 GPUs. The new model is roughly twice as large (13B/7B), so you need about twice the GPUs, but you can reuse one from the old fleet."], "correct_index": 2}}, {"id": "cloud-0087", "title": "The Real-Time Dilemma: TTFT vs. Throughput", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With an 80ms batching window and 40ms prefill, what worst-case TTFT should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40ms", "80ms", "120ms", "200ms"], "correct_index": 2}}, {"id": "cloud-0088", "title": "The On-Node vs. Cross-Node Latency Jump", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack InfiniBand NDR transfer compared to a transfer between two GPUs on the same server using NVLink 4.0?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 0}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x slower", "~10x slower", "~100x slower", "~1000x slower"], "correct_index": 1}}, {"id": "cloud-0089", "title": "The RAG Latency Trap: Compound AI Systems", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of these two stages is the dominant source of latency in the RAG pipeline?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 0}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The vector database lookup, because disk I/O is fundamentally slower than on-chip computation.", "The LLM generation, because it is an intensely memory-bandwidth-bound operation.", "They are roughly equal, with compute and I/O taking about the same amount of time.", "The network transfer between the database and the LLM server."], "correct_index": 1}}, {"id": "cloud-0092", "title": "The Real-Time Translation Bottleneck", "topic": "batching-strategies", "competency_area": 
"latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To understand the system's limits, what is the theoretical minimum baseline Time Per Output Token (TPOT) for a single, isolated user?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Less than 1 ms. The operation is compute-bound, limited by the H100's PetaFLOP-scale compute.", "Approximately 95 ms. The 3.35 TB/s bandwidth is measured in Terabits, not TeraBytes, reducing effective bandwidth.", "Approximately 12 ms. The operation is memory-bound by the time it takes to read 40 GB of weights over the 3.35 TB/s HBM interface.", "Around 300 ns. This is the fundamental latency of a single HBM3 memory access."], "correct_index": 2}}, {"id": "cloud-0093", "title": "The Serving Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What fundamental priority did we fail to invert?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0094", "title": "The P99 Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT spikes above 500ms despite a busy GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0095", "title": "The Throughput Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why can a high-throughput static-batched server still feel laggy when TPOT is ~12ms but TTFT is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0096", "title": "The Memory Wall of Long Contexts", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why can a few 16k-token conversations cause an OOM on an H100 system that comfortably handles 64 short-prompt users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0098", "title": "The Interactive API Latency Spike", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of the P99 TTFT exceeding 500ms with static batching and spiky GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0099", "title": "The Code-Gen Throughput Ceiling", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": 
"What approximate throughput improvement can continuous batching provide for output lengths [20, 50, 80, 100, 150, 200, 300, 1000]?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0100", "title": "The Unstable Translation Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the FIFO single-worker system meet the 200ms TTFT deadline at 10 requests/sec, and why?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 2}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0101", "title": "The Continuous Batching OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much KV-cache capacity remains after loading the 70B INT8 weights, and why can long contexts OOM the GPU?", "chain_ids": ["cloud-chain-auto-008-18"], "chain_positions": {"cloud-chain-auto-008-18": 0}, "chain_tiers": {"cloud-chain-auto-008-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0102", "title": "TTFT Lower Bound for a 70B Model on H100s", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is a <200ms TTFT mathematically feasible for a 70B model, and what is the primary hardware bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0103", "title": "The High-Throughput API Crisis", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this poor performance despite the GPU being fully utilized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0104", "title": "The Real-Time Ad Bidding SLA", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the current single-server ad bidding system meet the 99.9% under 50ms SLA with 5% of requests taking 80ms, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0107", "title": "The Continuous Batching Hiccup", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely design flaw in your continuous batching scheduler causing these latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0108", "title": "The Live Caption Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Based on an M/M/c queue, is there a finite number of H100 servers (c) that can keep 99% of 10-second transcription chunks under 500ms at 3 RPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0109", "title": "The TPOT Trade-off", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much computation does static batching waste by padding 64 requests to 512 tokens, and what throughput gain should continuous batching give?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 2}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0110", "title": "The Algorithmic Trading Deadline", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely bottleneck causing the missed deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0111", "title": "The Chatbot Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using queueing theory, what is the fundamental state of this system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is memory-bound due to the KV cache size for 16 users.", "The system is operating in a stable but highly-loaded state (\\rho \\approx 99%).", "The network is saturated from streaming back responses for 16 users simultaneously.", "The system is unstable (\\rho > 1) because the arrival rate is higher than the service rate."], "correct_index": 3}}, {"id": "cloud-0112", "title": "The Prefill-Decode Collision", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why would a short prompt be forced to wait so long in a continuous batching system?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 0}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The KV cache ran out of memory, forcing an eviction/recomputation cycle.", "The continuous batching scheduler's overhead becomes too high with long sequences.", "The long-context prefill is a monolithic, non-preemptive operation that blocks new requests.", "The inter-token latency (TPOT) for the long-context request slowed down the 
whole system."], "correct_index": 2}}, {"id": "cloud-0113", "title": "The Static Batching TTFT Penalty", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is TTFT so high for ~100-token prompts when statically batched to 4096 tokens on an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network latency between the web server and the H100 is too high.", "The 7B parameter model is too large, causing slow weight loading from VRAM.", "Static batching forces every short prompt to pay the full computational cost of the 4096-token context window.", "The batch size is too small, which underutilizes the H100's Tensor Cores."], "correct_index": 2}}, {"id": "cloud-0114", "title": "The Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If VRAM is not the issue, what queueing effect explains why batch size 64 makes P99 latency skyrocket and effective throughput decrease?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU has hit a Memory-Capacity-Bound limit, running out of VRAM for the batch of 64.", "The system is Queue-Bound; utilization (rho) exceeds 1.0, causing head-of-line blocking and wait times to explode.", "The H100 is Bandwidth-Bound; memory bandwidth saturation strictly limits throughput to 2133 req/s.", "The CPU is Single-Thread-Bound, restricting the batch preparation rate to 50% of the GPU's capacity."], "correct_index": 1}}, {"id": "cloud-0115", "title": "The 60% Utilization Mystery", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely reason for this phenomenon?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU is memory-bandwidth bound, so compute cores are idle waiting for HBM. (Trap: assumes HBM limits utilization metric, but nvidia-smi util is time-based, not bandwidth-based)", "The Python GIL in the server is preventing the scheduler from running in parallel with GPU execution. (Trap: GIL affects threading, but continuous batching runs mostly in C++ backend)", "The request arrival rate is not high enough to fully saturate the server's capacity. 
(Trap: contradicts \"queue is full\" in scenario)", "The GPU is frequently idle, waiting for the CPU-bound scheduler to manage requests and memory between batches."], "correct_index": 3}}, {"id": "cloud-0116", "title": "The Reinforcement Learning Latency Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely bottleneck when sending 500,000 tiny CPU observation tensors per second to one GPU over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 ms wasted (Bandwidth-Bound)", "50 ms wasted (NVLink-Bound)", "500 ms wasted (Transaction-Overhead-Bound)", "5000 ms wasted (Memory-Bound)"], "correct_index": 2}}, {"id": "cloud-0117", "title": "The Translation API Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What batching behavior most likely causes the 1.5s P99 TTFT despite good TPOT in the H100 translation API?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 compute is underutilized; switching to a smaller, cheaper GPU would be more efficient.", "The network latency between the load balancer and the inference servers must be spiking to over 1 second.", "The 1000ms batching timeout is too high for the low arrival rate, causing requests to wait too long in the queue before processing.", "The model is too large, causing slow cold starts during token generation, which increases the time to the first token."], "correct_index": 2}}, {"id": "cloud-0118", "title": "The P99 Latency Explosion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely queueing cause of P99 TTFT exceeding 900ms with 20 RPS and static batching of 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The batch size is too small, resulting in inefficient, low-throughput GPU kernels.", "The H100 GPU is not powerful enough to handle 20 RPS, causing a persistent backlog of requests.", "Head-of-line blocking from the static batching strategy is creating extreme queueing delays for some requests.", "The network connection to the NVMe drives used for swapping KV-cache is saturated."], "correct_index": 2}}, {"id": "cloud-0120", "title": "The Chatbot's Awkward Silence", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is causing P99 TTFT over 800ms despite high GPU utilization and excellent TPOT with a 500ms static batching window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 70B model's prefill computation is too slow for the H100 GPU, creating a compute bottleneck.", "The system needs more H100 GPUs to handle the request volume and reduce queueing delays.", "The static 
batching window (500ms) forces requests to wait artificially, which is the primary contributor to TTFT.", "Network latency between the user and the datacenter is the most likely cause for the >800ms delay."], "correct_index": 2}}, {"id": "cloud-0121", "title": "The Unstable Chatbot Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely root cause of excessive TTFT with 10 RPS, a 400ms static batching timeout, batch size 4, and 450ms prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 450ms processing time is too slow for the GPU, indicating a compute bottleneck.", "The 400ms batching timeout is too short, preventing the formation of larger, more efficient batches.", "The arrival rate (10 RPS) exceeds the system's maximum service rate (~8.88 RPS), causing an unstable and ever-growing request queue.", "Network latency between the user and the datacenter is the primary contributor to the 800ms+ TTFT."], "correct_index": 2}}, {"id": "cloud-0122", "title": "The Chatbot Timeout Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary cause of the 800ms P99 TTFT with static batching of 32 and 450ms full-batch forward passes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU is not powerful enough. We should upgrade to reduce the batch processing time from 450ms.", "The static batching policy is causing head-of-line blocking. We should implement continuous batching to eliminate queuing delay.", "The 13B model is too large. We should quantize the model to INT8 to decrease the per-batch inference time.", "The issue is inefficient token generation. We should implement speculative decoding to improve Time Per Output Token (TPOT)."], "correct_index": 1}}, {"id": "cloud-0123", "title": "The Chatbot's Silent Wait", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is most likely causing the 800ms P99 TTFT when the GPU is only 60% utilized and static batching uses a 200ms timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 GPU is not powerful enough for this load. We should upgrade to a B200 to reduce the per-batch compute time.", "The model's prefill computation is the bottleneck. 
We should apply INT8 quantization to reduce the TFLOPs required.", "The fixed 200ms batching timeout is causing excessive queueing delay; requests wait idly instead of being processed.", "The bottleneck is network I/O from fetching user data for each request, causing the serving process to block before batching."], "correct_index": 2}}, {"id": "cloud-0124", "title": "Static Batching Throughput Limit for Translation Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is TTFT P99 climbing steadily past 2 seconds at 12 RPS despite only 65% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The static batch size is too large, increasing per-batch latency. Reducing it to 8 would lower TTFT.", "The H100 GPU is underpowered for this model. GPU utilization would be 100% if it were the bottleneck.", "The system is overloaded because its maximum throughput is lower than the arrival rate, causing the request queue to grow. Static batching is artificially depressing throughput.", "The 25ms prefill latency is the primary bottleneck. Optimizing the data input path is the highest priority."], "correct_index": 2}}, {"id": "cloud-0125", "title": "The Chatbot Lag Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What explains the huge gap between 150ms P50 TTFT and 2.5s P99 TTFT with static batching of 64 at 95% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The prefill computation is stalling at 2.15s due to unoptimized attention kernels. (Mistaking queue time for compute time).", "The system is experiencing severe head-of-line blocking due to its static batching policy, causing 2.0s queue delays.", "The GPUs are throttling, causing a 16x degradation from 150ms to 2500ms. (Ignoring the static batching pipeline).", "Network I/O is saturated at the ingress, dropping the request packets. (Misdiagnosing system-level queueing as network congestion)."], "correct_index": 1}}, {"id": "cloud-0126", "title": "The Chatbot SLO Catastrophe", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the core problem causing the 500ms P99 TTFT SLO breach in this LLM chat service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100s are underpowered for this workload; the high utilization proves they can't keep up with the request volume.", "The InfiniBand network latency is adding too much overhead, causing requests to miss their deadline.", "Head-of-line blocking from static batching is causing massive queueing delays, and the average request wait time is 10x the SLO.", "GPU utilization is too high, leading to thermal throttling. 
We should reduce the batch size to give the GPU recovery time."], "correct_index": 2}}, {"id": "cloud-0128", "title": "The Translation API's Latency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What causes the 250ms TTFT SLA miss when static batching adds a 200ms window on top of 150ms prefill time?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 1}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The TTFT is 150ms + 50 tokens * 20ms = 1150ms, which means the model decode speed is the bottleneck.", "The system is experiencing head-of-line blocking. A request can wait in the queue for up to 200ms before processing even begins, pushing the total TTFT to 350ms.", "The request arrival rate is too high, overwhelming the system. The service needs more GPU replicas to handle the load.", "The TTFT is exactly 150ms since it does not include queue time, indicating a network latency issue."], "correct_index": 1}}, {"id": "cloud-0130", "title": "The Chatbot Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling issue explains low 40% GPU utilization and P99 TTFT over 200ms when average TTFT is only 30ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100's memory bandwidth is insufficient for the 7B model, making the memory-bound decode step (TPOT) the bottleneck.", "The prefill computation for processing the input prompt is too slow, making the service compute-bound on the GPU.", "The static batching timeout creates head-of-line blocking and inefficient small batches, leading to high queueing delay and low GPU utilization.", "Network latency for incoming requests is highly variable, and the serving system has no control over this external factor."], "correct_index": 2}}, {"id": "cloud-0131", "title": "The Chatbot Latency Crisis (cloud-0131)", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the 100ms static batching timeout is removed with continuous batching, what should happen to P99 TTFT and why?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 2}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Slightly worse, ~110ms. The smaller batch size has lower arithmetic intensity, reducing MFU and making each step slower, which dominates any queueing gains.", "~53ms. The average batch size is halved (32 -> 16), so the system's throughput is halved, and thus latency must also be halved.", "~105ms. The GPU is the fundamental bottleneck. Serving policy doesn't change the time it takes to compute a token, so the TTFT will remain the same.", "~1-5ms. 
The 100ms static batching timeout (T_queue) is eliminated. The new latency is simply the compute time of the prefill step, which is on the order of milliseconds."], "correct_index": 3}}, {"id": "cloud-0133", "title": "The Chatbot Lag Catastrophe", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT violation with a 150ms static batching timeout and only 40% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100's memory bandwidth is insufficient, causing a bottleneck when loading model weights for each batch. We should use tensor parallelism to split the model across multiple GPUs.", "The static batching timeout is too short. We should increase it to 300ms to create larger, more efficient batches, which will increase the 40% GPU utilization.", "The system is experiencing head-of-line blocking due to static batching, where new requests are stuck waiting for long-running batches to complete. Switching to continuous batching would solve this.", "The PCIe bus is saturated, preventing the CPU from feeding data to the H100 fast enough, which explains the low 40% utilization."], "correct_index": 2}}, {"id": "cloud-0134", "title": "The Chatbot's Unresponsive Start", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the 500ms P99 TTFT when static batching uses batch size 32 and a fixed 400ms timeout?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Llama-70B model's per-token processing time is too high, and a smaller model is needed.", "The network connection between the load balancer and inference servers adds latency.", "The static batching window forces early-arriving requests to wait for the timeout, causing high queuing delay that violates the TTFT SLO.", "The H100's memory bandwidth is saturated, causing delays in loading model weights for each batch."], "correct_index": 2}}, {"id": "cloud-0135", "title": "The Translation Service Traffic Jam", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the P99 TTFT spikes above 500ms with a 100ms static batching timeout and 90% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 13B model exceeds the H100's SRAM cache, forcing 500ms roundtrips to HBM.", "The static batching timeout is causing head-of-line blocking. Switch to continuous batching.", "The 90% utilization proves the GPU is compute-bound. Downsize the model to 7B.", "The 100ms timeout is too short to assemble optimal batches. 
Increase it to 500ms."], "correct_index": 1}}, {"id": "cloud-0137", "title": "The LLM Metrics", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What specific generation metrics are we failing to monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0138", "title": "The Shadow GPU Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is shadow-testing an LLM so much more expensive than shadow-testing a traditional ML model, and how do you make it feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0139", "title": "The Guardrail Latency Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the PM wrong, and how do you fix it without removing the guardrail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0140", "title": "The Feature Store Consistency Trap", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 15% value divergence between the online Redis and offline Parquet feature paths?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0141", "title": "The P99 Latency Anomaly", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can TPOT stay stable while P99 TTFT explodes under static batching, and what is the root cause of the TTFT explosion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0142", "title": "The Continuous Batching Plateau", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What practical system constraint is preventing them from reaching the theoretical throughput gains from continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0143", "title": "Static Batching and TTFT SLO Failure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can any nonzero static batch size meet the 400ms P99 TTFT SLO, and what does that imply for throughput?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 2}, "chain_tiers": {"cloud-chain-auto-001-05": 
"primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0144", "title": "The ROI of Heterogeneity", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much throughput could continuous batching recover over static batching for a workload with 95% short and 5% 4k/4k long requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0145", "title": "The SLO Squeeze: Interactive vs. Batch Throughput", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you use priority scheduling or partition the 8 GPUs between chat and batch jobs, and what trade-off does that create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0146", "title": "The Continuous Batching Tail Latency Paradox", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are 4000-token prompts causing 1.5-2s P99 TTFT spikes in the continuous batching loop, and how would you mitigate them?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 1}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0147", "title": "The Throttling Dilemma: Per-User vs. 
Global Queueing", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do global FIFO and per-user fair queuing differ under a burst from one power user in a multi-tenant LLM API?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0148", "title": "The P99 Latency Volcano", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency exploding near 50% of max throughput with static batch size 8, and should you add GPUs or change batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0149", "title": "The Two-Tier Traffic Jam", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a single continuous-batching configuration fail to optimize both chat TTFT and summarization TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0150", "title": "The Priority Queue Impasse", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do interactive users see multi-second TTFT in a FIFO continuous batcher at 100% GPU utilization, and what scheduling fix would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0151", "title": "The KV Cache Thrashing Cascade", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this continuous-batched cluster compute-bound or KV-cache-capacity-bound, and what cascading failure explains the 30% RPS loss and P99 spikes?", "chain_ids": ["cloud-chain-auto-008-18"], "chain_positions": {"cloud-chain-auto-008-18": 1}, "chain_tiers": {"cloud-chain-auto-008-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0152", "title": "The Deadline-Missing Detector", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a system with 90% average utilization failing so badly, and what queueing strategy would you recommend?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 3}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0153", "title": "The Cannibalistic Batching Strategy", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the batching strategy cannibalizing its own efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0154", "title": "The Head-of-Line Blocking Crisis", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What flaw in the fixed batch size 8 static batching stack is causing poor P99 TTFT, and why is continuous batching better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0156", "title": "The Prefill vs. Decode Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might doubling batch size improve aggregate TPOT but harm TTFT and user experience for the code completion assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0159", "title": "The Coding Assistant's Latency Crisis", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is driving the 1500ms P99 TTFT despite excellent throughput, and why is the 1000ms static batching timeout the culprit?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 3}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0160", "title": "The Tyranny of Throughput", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do high GPU utilization and TPS still produce lag under a 150ms static batching timeout with a 200ms TTFT SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0161", "title": "The Code Assistant's Latency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What queueing failure causes the 13B code assistant to miss its 200ms P99 TTFT SLO despite fast per-request inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0162", "title": "The Real-Time Voice Assistant Stutter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is increasing static batch size from 8 to 32 the wrong fix for 40% GPU utilization, and what batching change should you make?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 13B model exceeds the H100's SRAM cache, forcing 500ms roundtrips to HBM.", "The static batching timeout is causing head-of-line blocking. Switch to continuous batching.", "The 90% utilization proves the GPU is compute-bound. Downsize the model to 7B.", "The 100ms timeout is too short to assemble optimal batches. Increase it to 500ms."], "correct_index": 1}}, {"id": "cloud-0163", "title": "The Black Friday Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did the system fail non-linearly, and how does continuous batching on GPUs shift this queueing knee point differently than traditional CPU-based web serving?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 4}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0164", "title": "The Input Chunking Pipeline Bubble", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does chunking the prompt slow down the time to first token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0166", "title": "The SLO-Violating Deadline Scheduler", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the deadline scheduler failing long jobs despite 50% utilization, and what statistical phenomenon is being ignored?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0167", "title": "The Speculative Decoding Memory Bomb", "topic": "speculative-decoding", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization designed to reduce latency cause a catastrophic memory failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0168", "title": "Head-of-Line Blocking in Continuous Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a continuous batching strategy catastrophically degrade P99 TTFT for your most sensitive users?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0169", "title": "The Throughput-Optimized Cascade Failure", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did optimizing for a peak hardware metric (TPOT) 
lead to a catastrophic system-level failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0171", "title": "The In-Flight Priority Queue Failure", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is a priority queue insufficient for guaranteeing the SLA, and what physical constraint of the hardware is it failing to account for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0172", "title": "The Continuous Batching Paradox", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can throughput be up, but latency for our most important users be so much worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0173", "title": "The TTFT vs. TPOT Tug-of-War", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can't the 70B code assistant get both low TTFT at batch size 1 and excellent TPOT at large batch size, and what design can satisfy both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0174", "title": "The Real-Time Queue Collapse", "topic": "graceful-degradation", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did useful throughput fall below baseline during the 3x surge, and what critical deadline-aware feature is missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0176", "title": "The SLO Violation Cascade", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why won't adding replicas fix 150ms chat TTFT when 100k-token summarization prefills run on the same H100 pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0177", "title": "The Continuous Batching Death Spiral", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization designed for high throughput result in a catastrophic latency failure, and what fundamental law of systems has your team forgotten?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0178", "title": "The High-Priority Queue Stall", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": 
"optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does an optimization designed to increase throughput lead to a catastrophic failure in latency, and what is the primary cause of the excessive delay?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0179", "title": "The Continuous Batching Stall", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What scheduling flaw in continuous batching causes severe TTFT spikes under high load, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0180", "title": "The Multi-Tenant SLO Crisis", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What serving architecture lets latency-sensitive Swift requests avoid being blocked by long Deep prefills on the same H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0181", "title": "The Phoenix False Positive", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you triage the Phoenix false drowsiness alerts and redesign the pipeline to prevent environmental skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0182", "title": "The AI Analyst's Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three scheduling and capacity decisions for serving real-time news and long report jobs simultaneously on a shared cluster without violating the 500ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0183", "title": "The Silent Utilization Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did the 70B service become cost-inefficient after shifting from long document summarization to short conversational queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0184", "title": "The Earnings Call Meltdown", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does processing one transcription request at a time miss the 500ms P99 TTFT SLO, and what batching architecture fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0185", "title": "The Real-Time Translation Stalemate", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three design decisions to address this P99 latency explosion, and why are they the correct levers to pull?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0186", "title": "The 8 Petabyte Skew Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you redesign the log-to-training pipeline to handle 8 TB/day per vehicle and eliminate C++/Python training-serving skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0187", "title": "The SLA Collision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What serving architecture should replace static monolithic batches, and why is a priority queue insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0188", "title": "The 'Laggy' Code Assistant: A Batching Design Challenge", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What serving architecture can meet StaffML Code's strict TTFT and TPOT SLOs without the static batching trade-off?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 3}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0189", "title": "The Copilot Latency Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does static batching miss the 200 ms P99 TTFT while leaving GPUs underutilized?", "chain_ids": ["cloud-chain-auto-001-05"], "chain_positions": {"cloud-chain-auto-001-05": 3}, "chain_tiers": {"cloud-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0190", "title": "The Conversational AI Traffic Jam", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is your core architectural choice for request handling and batching to meet a 500ms P99 TTFT with highly variable query lengths, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0191", "title": "The Bi-Modal GPU Dilemma", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve both products on a single B200 without violating SLAs or thrashing GPU memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0192", "title": "The Two-Tier SLA Conundrum", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect serving for the two product tiers on 16 GPUs without one tier hurting the other's SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0193", "title": "The Blackwell Disappointment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you diagnose and redesign the serving stack to achieve a 3x cost-per-token reduction without latency regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0194", "title": "The Interactive Coding Assistant SLO Catastrophe", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you prove static batching cannot meet the 500 ms TTFT and size capacity under continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0195", "title": "The Underutilized Accelerator", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the fundamental limitation of standard dynamic batching in this scenario, and what advanced technique maximizes GPU throughput and reduces tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0196", "title": "The OOMing Generator", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the most likely cause of these OOMs under high concurrency, and what technique mitigates this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0198", "title": "The Streaming vs Batch Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "When does token streaming reduce perceived latency for the 8-second chatbot response, and when can it hurt?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0199", "title": "Sizing GPUs for a 335M Embedding Model", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a high-end GPU the right hardware for a 335M embedding model, and what batch size should you target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0200", "title": "The Warm-up Request Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the first real request 60× slower after health check passes, and how many warm-up requests are needed before production traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0201", "title": "The Tokenizer Mismatch", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does a tokenizer version mismatch cause a massive GPU memory leak, and how does it destroy your serving economics?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 0}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0203", "title": "The Normalization Mismatch", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What's the most likely preprocessing bug?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0204", "title": "The Pre-computation Trade-off", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do we reduce inference compute costs without losing model accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0205", "title": "The Token Budget Economics", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which accelerator has the best cost per million tokens for serving a 70B LLM at realistic batch sizes, and when does that change?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 0}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0206", "title": "Warm 70B Replica vs Scale-to-Zero Break-Even", "topic": "model-serving-infrastructure", "competency_area": 
"deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is keeping one warm 70B replica 24/7 worth the $2,500/month, and what is the break-even versus scale-to-zero?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0207", "title": "The Triton Inference Server Ensemble", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do a Triton ensemble and a single Python process compare for latency and throughput in this RAG pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0208", "title": "The Structured Output Constraint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is this overhead acceptable, and can we reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0209", "title": "The A/B Testing at Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this statistically unnecessary but infrastructure-ruinous, and how does the GPU memory asymmetry between these models dictate your A/B testing architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0210", "title": "Token-Based Rate Limiting for LLM APIs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design a fair rate limiter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0211", "title": "The Structured Output Parsing Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the CPU doing that takes so much time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0212", "title": "The Continuous Batching Scheduler", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does continuous batching reduce the 13B model's static-batching latency, and what improvement should you expect?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 3}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0213", "title": "The GC Pause Latency Spike", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's stealing 760 ms from your GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0214", "title": "The TensorRT Incompatibility", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a driver update break a model file, and what's the correct deployment practice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0215", "title": "The BatchNorm Drift", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is silently changing to cause this accuracy drop without weight modifications, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0216", "title": "The One-Replica Meltdown", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is one slow replica destroying the P99 for the entire fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0217", "title": "The CPU Preprocessing Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where's the bottleneck causing the 15 tokens/s rate and sawtooth GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0218", "title": "The GIL Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the throughput plateau at 110 req/s regardless of how many replicas are added, and what is the ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0219", "title": "The Multi-Tenant GPU Sharing Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0220", "title": "Mixed SLA Head-of-Line Blocking in a Shared GPU Pool", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do autocomplete requests experience 4-second P99 latency on a shared GPU pool, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0223", "title": "The Serverless Inference Trade-off", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the cost break-even point between SageMaker Serverless and a dedicated A10G, and which option wins for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0224", "title": "The Continuous Batching Starvation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How did your batching algorithm starve the easy requests?", "chain_ids": ["cloud-chain-auto-001-06"], "chain_positions": {"cloud-chain-auto-001-06": 4}, "chain_tiers": {"cloud-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0225", "title": "The LLM Canary Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do standard infra metrics miss LLM quality regressions, and how could you use the GPU's KV-cache memory profile as a hardware-level canary signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0226", "title": "The Multi-Model Serving Platform", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a multi-model serving platform that cuts the $2M/month GPU bill by at least 40% without violating SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0227", "title": "The Speculative Decoding Speedup", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Walk me through the systems math for speculative decoding — when does it help, and when does it backfire?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0228", "title": "The KV-Cache OOM Attack", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does a 64x increase in prompt length from a few users 
crash the entire cluster, and what's your emergency response?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0229", "title": "The Batching Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do we fix this idle capacity caused by static batching in an LLM API?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0230", "title": "The Speculative Decoding Accept Rate Crash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did the Speculative Decoding optimization become a performance penalty for the coding assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0231", "title": "The KV-Cache Context Explosion", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is this physically impossible?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0232", "title": "The Inference Cost Attribution Puzzle", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a fair inference cost attribution model for the five LLMs on the shared 64-A100 vLLM pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0233", "title": "The Speculative Memory Trade-off", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding degrade throughput at high batch sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0235", "title": "Handling KV Cache Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does PagedAttention specifically solve this issue in the KV cache?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 0}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It compresses the key and value tensors using quantization before storing them in the 
cache.", "It partitions the KV cache into non-contiguous fixed-size blocks, eliminating external fragmentation and allowing dynamic memory allocation per token.", "It proactively evicts the least recently used KV cache tensors to free up contiguous memory blocks for new requests.", "It offloads the KV cache to CPU RAM when GPU memory is fragmented and prefetches it when needed."], "correct_index": 1}}, {"id": "cloud-0236", "title": "Advantages of Continuous Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How does continuous batching improve throughput compared to static batching?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 1}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It increases the clock speed of the GPU dynamically.", "It allows new requests to join the active batch as soon as other sequences complete.", "It batches all requests that have the exact same prompt length together.", "It caches the output tokens to reuse them."], "correct_index": 1}}, {"id": "cloud-0237", "title": "Mechanism of Speculative Decoding", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the fundamental mechanism that allows it to achieve speedup?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 0}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The target model is used only for the first few tokens, and the draft model completes the rest of the sequence to save time.", "The draft model generates multiple tokens sequentially, which the target model then verifies in a single parallel forward pass, accepting correct tokens and correcting the first divergence.", "The draft model and target model generate tokens in parallel, and a majority vote decides which token to output.", "The draft model continuously fine-tunes the target model during inference to make it generate tokens faster."], "correct_index": 1}}, {"id": "cloud-0238", "title": "Applying Little's Law to Inference Servers", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "With lambda = 10 requests/s and W = 5 seconds, how many concurrent requests L must the LLM server support to remain stable?", "chain_ids": ["cloud-chain-auto-011-02"], "chain_positions": {"cloud-chain-auto-011-02": 0}, "chain_tiers": {"cloud-chain-auto-011-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server needs to support 5 concurrent requests at any given time.", "The server needs to support 15 concurrent requests to have a buffer for peak loads.", "The server needs to support an average of 50 concurrent requests in the system.", "The server needs to support 2 concurrent requests, as 10 divided by 
5 is 2."], "correct_index": 2}}, {"id": "cloud-0239", "title": "The Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does the 'ridge point' on a GPU roofline model represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The maximum memory bandwidth (TB/s) of the GPU.", "The maximum theoretical compute performance (TFLOPS).", "The minimum arithmetic intensity (FLOPs/Byte) required to be compute-bound.", "The power consumption (Watts) when the GPU is idle."], "correct_index": 2}}, {"id": "cloud-0240", "title": "Identifying the Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "With arithmetic intensity 40 FLOPs/Byte on an H100 ridge point of 295 FLOPs/Byte, is the workload compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 134 TFLOPS is lower than 989 TFLOPS.", "Memory-bound, because its arithmetic intensity is below the ridge point.", "Compute-bound, because it utilizes less than 15% of peak compute.", "Memory-bound, because 40 FLOPs/Byte exceeds the PCI-e bandwidth limit."], "correct_index": 1}}, {"id": "cloud-0241", "title": "The Role of HBM Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In the context of the GPU roofline model, what aspect of performance does the HBM (High Bandwidth Memory) bandwidth primarily determine?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 0}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The flat part of the roofline (the peak TFLOPS ceiling).", "The total capacity (GB) of the GPU's memory.", "The slope of the roofline for memory-bound workloads.", "The latency (in nanoseconds) of a single L1 cache access."], "correct_index": 2}}, {"id": "cloud-0242", "title": "The 70B Parameter Litmus Test", "topic": "activation-memory", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much memory do the FP16 weights of a 70B parameter model require, and does it fit on a single 80 GB NVIDIA H100?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 1}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 GB, so it fits with 10 GB to spare.", "1120 GB, so it does not fit.", "140 GB, so it does not fit.", "280 GB, so it does not fit."], "correct_index": 2}}, {"id": "cloud-0243", "title": "The B200's Architectural Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": 
"implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the B200's ridge point from 2,250 TFLOPS and 8.0 TB/s, and what does it imply for efficient workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~295 Ops/Byte, because it's similar to the H100.", "~2,250 Ops/Byte, assuming bandwidth was specified in Tb/s.", "~0.0035 Bytes/Op, because the ratio was inverted.", "~281 Ops/Byte, indicating it's a compute-bound architecture."], "correct_index": 3}}, {"id": "cloud-0244", "title": "The Chinchilla Time Tax", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Assuming 100% hardware utilization, how long would it take to complete this 5.8x10^23 FLOP training run on a single NVIDIA H100 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~6.8 days.", "~6,800 days.", "~1,130 days.", "~2,980 days."], "correct_index": 1}}, {"id": "cloud-0245", "title": "The Optimizer's Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much VRAM is needed just for Adam optimizer states and gradients for a 70B-parameter LLM, excluding model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 GB", "280 GB", "700 GB", "1120 GB"], "correct_index": 2}}, {"id": "cloud-0246", "title": "The Voracious KV-Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary driver of KV-cache memory size when serving an LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The number of model parameters (P)", "The batch size (B)", "The input sequence length (S)", "The GPU's memory bandwidth"], "correct_index": 2}}, {"id": "cloud-0247", "title": "The Activation Memory Bubble", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In what scenario are model activations most likely to exceed model weights in memory usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Serving a 13B LLM with a short context length.", "Training a large CNN on high-resolution images with a large batch size.", "Fine-tuning a 1B parameter model with a small batch size.", "Running inference with a quantized MobileNet on a single image."], "correct_index": 1}}, {"id": "cloud-0248", "title": "The Adam Optimizer Memory Footprint", "topic": "extreme-quantization", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Will the model and optimizer states fit into the 80 GB memory if you ignore activations and 
KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it fits. It only needs ~16 GB. (Weights Only Trap)", "Yes, it fits. It needs ~64 GB. (Missing Gradients Trap)", "No, it does not fit. It needs ~128 GB.", "Yes, it fits. It needs ~32 GB. (Missing Adam State Trap)"], "correct_index": 2}}, {"id": "cloud-0250", "title": "The Datacenter Power Wall", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate Thermal Design Power (TDP) of a single modern datacenter GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~150 W", "~350 W", "~700 W", "~1200 W"], "correct_index": 2}}, {"id": "cloud-0251", "title": "The Energy Cost of Solitude", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much more energy per request does batch size 1 use than batch size 32 on a 700W H100 with 10ms vs 60ms batch latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's about the same, since power is the same.", "It's about 2x more energy per request.", "It's over 5x more energy per request.", "It's over 30x more energy per request."], "correct_index": 2}}, {"id": "cloud-0253", "title": "Identifying a Memory-Bound Workload", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does this tell you about the workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The workload is compute-bound.", "The workload has an inefficient implementation that should be discarded.", "The workload is memory-bound.", "The workload is perfectly optimized."], "correct_index": 2}}, {"id": "cloud-0254", "title": "Calculating GEMM Kernel Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of this 200 GFLOP GEMM, and is it memory-bound on an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 FLOPs/Byte, memory-bound", "0.0075 Bytes/FLOP, compute-bound", "133.3 FLOPs/Byte, memory-bound", "400 FLOPs/Byte, compute-bound"], "correct_index": 2}}, {"id": "cloud-0255", "title": "The H100's Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the ridge point from 989 TFLOPS and 3.35 TB/s, and what does it mean for kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.0034 FLOPs/Byte", "295.2 FLOPs/Byte", "295,200 FLOPs/Byte", "3.39 FLOPs/Byte"], 
"correct_index": 1}}, {"id": "cloud-0256", "title": "The 16x VRAM Multiplier for Training", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much VRAM is required just to store the FP32 weights, gradients, and Adam optimizer states for a 70B-parameter LLM?", "chain_ids": ["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 0}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 GB", "~560 GB", "~1120 GB", "~140 GB"], "correct_index": 2}}, {"id": "cloud-0258", "title": "The Mixed Precision Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With mixed-precision Adam for a 7B model, what persistent memory is required, and does it fit on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["56 GB", "84 GB", "98 GB", "112 GB"], "correct_index": 3}}, {"id": "cloud-0259", "title": "The MoE Compute Fallacy", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How should you estimate Chinchilla-optimal training compute for this 1T-parameter MoE with 2 of 10 experts active per token?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 0}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.2 x 10^26 FLOPs", "~8.0 x 10^23 FLOPs", "~4.8 x 10^24 FLOPs", "~2.4 x 10^25 FLOPs"], "correct_index": 2}}, {"id": "cloud-0260", "title": "The Chinchilla Compute Budget", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a Chinchilla-optimal 100B dense transformer, how many training tokens and FLOPs should you budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4.0 x 10^23 FLOPs", "1.2 x 10^23 FLOPs", "1.2 x 10^24 FLOPs", "6.0 x 10^22 FLOPs"], "correct_index": 2}}, {"id": "cloud-0262", "title": "The TOPS/W Efficiency Metric", "topic": "energy-per-operation", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does TOPS/W mean, and why is it critical for a datacenter architect managing thousands of GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a measure of peak performance, and higher is always better.", "It's primarily a concern for battery-powered mobile devices, not datacenters.", "It measures compute efficiency, which directly impacts power and cooling costs at scale.", "It defines the maximum thermal 
output a GPU can sustain before throttling."], "correct_index": 2}}, {"id": "cloud-0264", "title": "The HBM Latency Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate latency you should recall for a single memory access to HBM3 on a modern datacenter GPU?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 0}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4 ns", "~300 ns", "~1,000 ns (1 µs)", "~40 ns"], "correct_index": 1}}, {"id": "cloud-0265", "title": "The Energy Cost of Precision: Extreme Quantization", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure energy consumption perspective, approximately how much more energy does an FP32 operation consume than an INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x more energy", "~3.4x more energy", "~18x more energy", "The energy is roughly the same"], "correct_index": 2}}, {"id": "cloud-0266", "title": "The Inference Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory do the 70B model weights require in FP16 versus INT8, and how many GB are saved by quantizing?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 1}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 GB. (Misconception: Assumed FP32 baseline, but then confused weight footprint calculation)", "210 GB. (Misconception: Assumed FP32 baseline of 280GB and INT8 of 70GB, resulting in 210GB savings)", "70 GB. (Correct calculation: 140 GB for FP16 minus 70 GB for INT8)", "280 GB. 
(Misconception: Calculated total FP32 model size instead of savings)"], "correct_index": 2}}, {"id": "cloud-0267", "title": "Quadratic Cost of Dense Self-Attention", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does standard dense self-attention compute scale as sequence length n increases?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["O(n)", "O(n log n)", "O(n^2)", "O(1)"], "correct_index": 2}}, {"id": "cloud-0268", "title": "The Chinchilla Data Budget", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "According to the compute-optimal Chinchilla scaling laws, approximately how many training tokens should you plan to acquire?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Trillion tokens", "420 Billion tokens", "1.4 Trillion tokens", "120 Billion tokens"], "correct_index": 2}}, {"id": "cloud-0269", "title": "The 700W Question", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate Thermal Design Power (TDP) you should state for a single modern datacenter GPU, like the NVIDIA H100, to correctly inform the datacenter facilities team?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 0}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["150 W", "350 W", "700 W", "2000 W"], "correct_index": 2}}, {"id": "cloud-0270", "title": "The Datacenter Rack Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many single-H100 servers fit in a 70 kW rack when each uses a 700W GPU, 300W of non-GPU power, and the PUE is 1.1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs. (Calculation: 70,000W / 700W)", "70 GPUs. (Calculation: 70,000W / 1,000W)", "63 GPUs. (Calculation: 70,000W / 1,100W)", "90 GPUs. 
(Calculation: 70,000W / 770W)"], "correct_index": 2}}, {"id": "cloud-0271", "title": "The Fusion Bottleneck", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When an ML compiler performs operator fusion, what is the primary hardware bottleneck it is designed to reduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total computational FLOPs", "Cross-node network traffic", "HBM Memory Bandwidth", "Model storage size on disk"], "correct_index": 2}}, {"id": "cloud-0272", "title": "The GPU Failure Cadence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Given a typical Mean Time To Failure (MTTF) of 50,000 hours for a single GPU, what is the expected frequency of a GPU failure somewhere in your fleet?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 0}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About once a month", "About once a week", "About once every 5 hours", "About once every 50,000 hours"], "correct_index": 2}}, {"id": "cloud-0273", "title": "Expected GPU Failures in a 10,000-GPU, 30-Day Training Run", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How many GPU failures should you budget for during a 30-day training run on a 10,000-GPU cluster?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 1}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Approximately 6 failures. Calculated by dividing the 30-day run by the 5-hour failure interval.", "Less than 1. The 50,000-hour MTTF of a single GPU makes a failure within a 720-hour run extremely unlikely.", "Approximately 144 failures. The 720-hour run will see a failure roughly every 5 hours.", "About 3,600 failures. 
Calculated by multiplying the 720-hour run by the 5-hour failure interval."], "correct_index": 2}}, {"id": "cloud-0274", "title": "The Data Loading Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How long will one epoch's 10 TB read from a ~7 GB/s NVMe SSD take, and what bottleneck does that create for the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~19 hours", "~3.4 minutes", "~24 minutes", "~3.25 hours"], "correct_index": 2}}, {"id": "cloud-0275", "title": "The Iceberg of ML Costs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over three years, which cost is likely to dominate TCO: the one-time training run or continuous inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time model training cost.", "The cumulative cost of running inference.", "The cost of data acquisition and labeling.", "The salaries of the R&D and engineering teams."], "correct_index": 1}}, {"id": "cloud-0276", "title": "The CapEx vs. TCO Fallacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the estimated first-year TCO for 10 H100s including hardware ($30K/GPU) and 5% maintenance, and why is sticker price misleading?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$300,000", "$301,500", "$315,000", "$330,000"], "correct_index": 2}}, {"id": "cloud-0277", "title": "Defining Arithmetic Intensity: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the definition of Arithmetic Intensity in the context of a GPU roofline model?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The number of operations per second (FLOPs/sec)", "The number of operations per Watt (FLOPs/Watt)", "The ratio of operations to data movement (FLOPs/Byte)", "The total memory bandwidth (GB/s)"], "correct_index": 2}}, {"id": "cloud-0278", "title": "The Meaning of TOPS/W", "topic": "energy-per-operation", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does the TFLOPS/W metric primarily allow you to calculate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak theoretical speed of a single GPU", "The compute performance per unit of power, indicating efficiency", "The 
latency of a single operation", "The speed of the memory subsystem"], "correct_index": 1}}, {"id": "cloud-0279", "title": "The Batch Size 1 Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this ResNet-50 batch-size-1 forward pass compute-bound or memory-bound, and how do you determine that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 250 FLOPs/Byte is a high arithmetic intensity.", "Memory-bound, because 0.004 Bytes/FLOP is far above the inverse ridge point.", "Memory-bound, because the AI of 250 FLOPs/Byte is less than the H100's ridge point of ~295 FLOPs/Byte.", "Compute-bound, because with 8 GFLOPs of work, the compute units will be the bottleneck."], "correct_index": 2}}, {"id": "cloud-0280", "title": "The INT8 Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate energy consumption ratio between a single FP32 operation and a single INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["An FP32 op consumes ~4x more energy than INT8", "An FP32 op consumes ~3.4x more energy than INT8", "An FP32 op consumes ~18x more energy than INT8", "An FP32 op consumes ~580x more energy than INT8"], "correct_index": 2}}, {"id": "cloud-0282", "title": "The Intra-Node Speed Advantage", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the approximate latencies of NVLink 4.0 within an HGX server and cross-rack InfiniBand NDR, and how do they compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink: ~500 ns, InfiniBand: ~1,000 ns", "NVLink: ~5,000 ns, InfiniBand: ~500 ns", "NVLink: ~500 ns, InfiniBand: ~5,000 ns", "NVLink: ~1 ns, InfiniBand: ~5,000 ns"], "correct_index": 2}}, {"id": "cloud-0283", "title": "The I/O-Bound Cost Fallacy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Assuming the job is entirely bottlenecked by loading the data from storage, what is the cost of one run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$0.28", "$8,200.00", "$2.28", "$18.22"], "correct_index": 2}}, {"id": "cloud-0286", "title": "The Roofline Litmus Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What metric determines whether an H100 workload is compute-bound or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total parameter count of the model", "The model theoretical peak throughput (TFLOPS)", "Arithmetic Intensity (FLOPs per Byte)", "Power efficiency in TOPS/Watt"], "correct_index": 2}}, {"id": "cloud-0288", "title": "The Quantization Memory Dividend", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "After quantizing FP16 model weights to INT8 for inference, what is the approximate reduction factor in memory usage?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 0}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x reduction", "~1.5x reduction", "~2x reduction", "~8x reduction"], "correct_index": 2}}, {"id": "cloud-0290", "title": "The Great Interconnect Divide: NVLink vs. InfiniBand", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate raw bandwidth difference between NVLink 4.0 and InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They have roughly the same bandwidth.", "NVLink 4.0 is about 2x faster than InfiniBand NDR.", "NVLink 4.0 is about 18x faster than InfiniBand NDR.", "InfiniBand NDR is about 4x faster than NVLink 4.0."], "correct_index": 2}}, {"id": "cloud-0291", "title": "The INT8 Inference Memory Footprint", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the bare minimum VRAM required to load a 7-billion parameter Llama model's INT8 weights for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "14 GB", "7 GB", "700 MB"], "correct_index": 2}}, {"id": "cloud-0293", "title": "The Core Motivation for Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a privacy perspective, what is the primary motivation for Federated Learning?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 0}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0295", "title": "The Arithmetic Intensity Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Will this operation's performance be primarily limited by the GPU's compute power or its memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because it's a mathematical 
operation on a powerful GPU.", "Memory-bound, because the ratio of compute to data movement is very low.", "Neither, it's bound by NVLink bandwidth.", "It depends entirely on the size of the vector."], "correct_index": 1}}, {"id": "cloud-0296", "title": "The Arithmetic Intensity Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this preprocessing kernel with 10 TFLOPs per 500 GB read compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 10 TFLOPs is a very large number of operations that should saturate the GPU.", "Memory-bound, because its Arithmetic Intensity of ~20 Ops/Byte is far below the H100's ridge point of ~295 Ops/Byte.", "Network-bound, because transferring 500 GB of data is the bottleneck, regardless of computation.", "It's impossible to tell without knowing the kernel's execution time in milliseconds."], "correct_index": 1}}, {"id": "cloud-0298", "title": "The Datacenter Cooling Tax", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With 8 GPUs at 700 W each and PUE 1.15, what total rack power is consumed including cooling overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.60 kW", "4.87 kW", "6.44 kW", "7.55 kW"], "correct_index": 2}}, {"id": "cloud-0300", "title": "The 7B Inference Memory Check", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the minimum VRAM required to simply load the model's weights in FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "14 GB", "28 GB", "112 GB"], "correct_index": 1}}, {"id": "cloud-0301", "title": "The Lifecycle TCO Inversion", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over a multi-year lifecycle, how does the one-time training cost typically compare with the cumulative inference cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Training cost is dominant, typically 5-10x greater than cumulative inference cost.", "The costs are roughly equal (a 1:1 ratio).", "Cumulative inference cost is dominant, typically 5-10x greater than training cost.", "The costs are unrelated (CapEx vs. 
OpEx) and not directly comparable."], "correct_index": 2}}, {"id": "cloud-0302", "title": "Identifying the Roofline's Axis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which fundamental metric determines whether an H100 workload is compute-bound or memory-bound, and what is its definition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ratio of Bytes transferred per FLOP (Bytes/FLOP).", "The raw throughput of the GPU in TFLOPS.", "The power efficiency of the GPU in TOPS/W.", "The ratio of FLOPs performed per Byte of memory transferred (FLOPs/Byte)."], "correct_index": 3}}, {"id": "cloud-0304", "title": "The Data Center Rack Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "With a 70 kW rack power limit and PUE 1.1, how many 700W GPUs can safely operate in the rack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs. (Calculated by 70,000W / 700W)", "110 GPUs. (Calculated by 70,000W × 1.1 / 700W)", "90 GPUs. (Calculated by 70,000W / 1.1 / 700W)", "83 GPUs. (Calculated by 70,000W / 1.2 / 700W)"], "correct_index": 2}}, {"id": "cloud-0305", "title": "The Blue-Green Pull Time", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how long does it take for a single server to download this container image over a modern datacenter network like InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 20 minutes", "About 1 minute", "About 12 seconds", "About 3 seconds"], "correct_index": 3}}, {"id": "cloud-0306", "title": "The RAG Pod Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming both inference models use FP16 weights, what total GPU memory is needed to load the 300M embedding model and 7B LLM?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 1}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14.0 GB", "7.3 GB", "14.6 GB", "116.8 GB"], "correct_index": 2}}, {"id": "cloud-0307", "title": "The TCO Iceberg: TCO & Cost Modeling", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over the model's expected 3-year production lifespan, which component is most likely to dominate the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The initial $1M training hardware cost.", "The 
electricity cost for the inference servers over 3 years.", "The salaries of the engineering team operating the service for 3 years.", "The network bandwidth costs for serving user traffic."], "correct_index": 2}}, {"id": "cloud-0309", "title": "The Ridge Point Rule", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the name of this ratio, and what fundamental bottleneck does being memory-bound signify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's called Throughput, and it means the GPU clock speed is too low.", "It's called Arithmetic Intensity, and it means the workload is bottlenecked by memory bandwidth.", "It's called Latency, and it means the PCI-e bus is saturated.", "It's called Arithmetic Intensity, and it means the workload is bottlenecked by the number of CUDA cores."], "correct_index": 1}}, {"id": "cloud-0311", "title": "The Real-Time Transcription Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "To meet this real-time constraint, what is the maximum average Time Per Output Token (TPOT) your system can have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["15 ms/token", "1000 ms/token", "67 ms/token", "15 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0313", "title": "The Node vs. The Network Latency Gap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a cross-rack InfiniBand transfer compared to an on-node NVLink transfer in terms of pure latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly the same speed (within 2x)", "About 10x slower", "About 100x slower", "InfiniBand is faster than NVLink"], "correct_index": 1}}, {"id": "cloud-0319", "title": "The Cross-Rack Divide", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To connect the different server nodes together, which of the following technologies is the standard choice for the high-bandwidth, low-latency fabric?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["NVLink 4.0", "PCIe Gen5", "InfiniBand NDR", "HBM3"], "correct_index": 2}}, {"id": "cloud-0321", "title": "The Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the industry-standard rule of thumb you should use to estimate the annual hardware maintenance costs, as a percentage of the initial capital expenditure (CapEx)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.5%", "5%", 
"25%", "50%"], "correct_index": 1}}, {"id": "cloud-0322", "title": "The A/B Test Power Bill", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you calculate the total electricity cost for the 30-day A/B test, including PUE, and what is the final number?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$108.00", "$118.80", "$83.16", "$118,800.00"], "correct_index": 1}}, {"id": "cloud-0323", "title": "The Datacenter Cooling Tax (PUE)", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a realistic PUE value for such a facility, and what does it physically represent?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 0}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PUE ≈ 2.0, meaning you need 1W of cooling for every 1W of compute.", "PUE ≈ 1.1, meaning you need 0.1W of cooling for every 1W of compute.", "PUE ≈ 1.0, meaning cooling is nearly free and consumes no extra power.", "PUE ≈ 0.9, meaning the cooling system generates its own power."], "correct_index": 1}}, {"id": "cloud-0325", "title": "The Memory-Bound Vision Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 50-TFLOP, 200-GB forward pass on an H100 compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 50 TFLOPs is a very high operational count that will saturate the execution units.", "Compute-bound, because its Arithmetic Intensity (250 Ops/Byte) is a high number, close to the hardware's peak.", "Memory-bound, because its Arithmetic Intensity (250 Ops/Byte) is less than the H100's ridge point (~295 Ops/Byte).", "Memory-bound, because it requires 4 Bytes per FLOP, which is too much data for the memory system."], "correct_index": 2}}, {"id": "cloud-0326", "title": "The Hidden Cost of Cooling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a PUE of 1.2 affect one 700W H100's power draw, and what is its daily electricity cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1.68 per day", "~$1.40 per day", "~$2.02 per day", "~$201.60 per day"], "correct_index": 2}}, {"id": "cloud-0327", "title": "The Blue/Green Memory Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much GPU memory is required for FP16 weights during the blue/green rollout of the 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "7 GB", "28 GB", "56 GB"], "correct_index": 2}}, {"id": "cloud-0329", "title": "The TCO of Privacy: Centralized vs. Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How should you compare the two training architectures over the 3-year project to choose the more economical one?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 1}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper with a TCO of $1.5M, vs. FL at $2.3M. (Error: Calculates for only 1 year)", "Federated Learning is cheaper with a TCO of $900k, vs. Centralized at $3M. (Error: Ignores CapEx)", "Federated Learning is cheaper with a 3-year TCO of $2.9M, vs. Centralized at $3.5M.", "Centralized is cheaper with a TCO of $2.0M, vs. FL at $3.9M. (Error: Mixes up CapEx and OpEx)"], "correct_index": 2}}, {"id": "cloud-0332", "title": "The RAG Rollout Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory will this new model consume just for the model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "56 GB", "14 GB", "28 GB"], "correct_index": 2}}, {"id": "cloud-0333", "title": "The Annual Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry rules of thumb, what is the approximate annual maintenance cost you should budget for this server, separate from power and operational costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1,500", "~$4,800", "~$24,000", "~$84,000"], "correct_index": 2}}, {"id": "cloud-0334", "title": "The CapEx vs. 
OpEx Blind Spot", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the standard hardware constants, what is the approximate TCO for this pod?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$240,000", "$276,000", "$297,000", "$258,000"], "correct_index": 2}}, {"id": "cloud-0336", "title": "The PUE Tax", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does PUE represent, and what total grid power does the 8-GPU server draw with a PUE of 1.1?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 1}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.60 kW", "0.77 kW", "6.16 kW", "6.72 kW"], "correct_index": 2}}, {"id": "cloud-0337", "title": "The On-Node vs. Off-Node Interconnect Gap", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much faster is a typical NVLink transfer compared to a cross-rack InfiniBand transfer in terms of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x faster", "About 10x faster", "About 100x faster", "They are roughly the same speed"], "correct_index": 1}}, {"id": "cloud-0339", "title": "The Maintenance Tax TCO", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the standard rule of thumb for estimating annual hardware maintenance cost as a percentage of initial CapEx?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0340", "title": "The Cloud vs. 
Hybrid TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the annual fleet cloud compute costs under Scenarios A and B, and which is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Scenario A is more cost-effective at $1.2 Million.", "Scenario A is more cost-effective at $8.0 Million.", "Scenario B is more cost-effective at $2.4 Million.", "Scenario B is more cost-effective at $2,400."], "correct_index": 2}}, {"id": "cloud-0344", "title": "The TCO Blindspot: Hardware Maintenance", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Using standard industry rules of thumb, what is the approximate annual maintenance cost for a single H100 server with about $30,000 CapEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$150", "$1,500", "$3,000", "$6,000"], "correct_index": 1}}, {"id": "cloud-0348", "title": "The Kernel Launch Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the total latency of the fused kernel after combining the 10 µs MatMul and 2 µs ReLU with one 5 µs launch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 µs", "22 µs", "17 µs", "15 µs"], "correct_index": 2}}, {"id": "cloud-0349", "title": "The Deceptive Addition", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of C = A + B for FP16 tensors, and is it memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["AI is ~0.5 FLOPs/Byte, making it compute-bound. (Forgetting to include the read bytes for B and write bytes for C).", "AI is ~0.5 FLOPs/Byte, making it memory-bound. (Forgetting to include the write bytes for C).", "AI is ~0.25 FLOPs/Byte, making it memory-bound. 
(Calculating 1 FLOP / 4 Bytes by forgetting FP16 size for C).", "AI is ~0.167 FLOPs/Byte, making it memory-bound."], "correct_index": 3}}, {"id": "cloud-0350", "title": "The Rack Density Limit", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If the 70 kW rack limit is the total rack power budget, how many H100s fit before exceeding cooling capacity, and how does reserving 20% for non-GPU system overhead change that number?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 1}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 GPUs", "100 GPUs", "700 GPUs", "10 GPUs"], "correct_index": 0}}, {"id": "cloud-0352", "title": "The OTA Update Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much storage is required for the 7B model's FP16 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "112 GB", "14 GB", "1.75 GB"], "correct_index": 2}}, {"id": "cloud-0355", "title": "The Arithmetic Intensity Litmus Test: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on the principles of the roofline model, what is the primary performance bottleneck for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory bandwidth, because the arithmetic intensity is far below the GPU's ridge point.", "Peak TFLOPS, because the model isn't doing enough operations to keep the cores busy.", "L2 cache latency, because the working data set doesn't fit in L1 cache.", "The power limit (TDP), because the GPU cannot draw enough power to run faster."], "correct_index": 0}}, {"id": "cloud-0356", "title": "H100: Compute or Memory Bound?", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this H100 kernel with 20 GFLOPs of work and 200 MB of HBM reads compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. Its Arithmetic Intensity is high, so performance is limited by the 989 TFLOPS of compute.", "Memory-bound. Its Arithmetic Intensity is 0.1 FLOPs/Byte, which is far below the H100's ridge point.", "Memory-bound. Its Arithmetic Intensity is 100 FLOPs/Byte, which is below the H100's ridge point of ~295 FLOPs/Byte.", "Compute-bound. 
Any operation performing 20 GFLOPs is inherently compute-intensive and will be limited by core speed, not memory."], "correct_index": 2}}, {"id": "cloud-0357", "title": "The Over-provisioned AI Rack", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What happens to 90 H100s in a 60 kW rack, and what sustained performance fraction can they reach versus theoretical peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The rack will run at 100% performance because each individual GPU's 700W TDP is well below the 60 kW rack limit.", "85 of the GPUs will run at 100% and the remaining 5 will be cleanly shut down to exactly meet the 60 kW budget.", "The rack is over-provisioned and will be throttled to ~95% of its peak performance.", "The rack is severely over-provisioned and will only run at about 80% of its peak performance."], "correct_index": 2}}, {"id": "cloud-0358", "title": "The RAG Fleet Update Bill", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What total data egress is billed to push a 500 MB vector index update to 10,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["500 MB (Misconception: Calculates for a single device, ignoring the fleet)", "140 TB (Misconception: Calculates using the 14GB model size instead of the 500MB index size)", "5 TB", "50 TB (Misconception: Unit conversion or arithmetic error)"], "correct_index": 2}}, {"id": "cloud-0359", "title": "The CapEx Baseline", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost for a single NVIDIA H100 GPU, which forms the baseline for your Total Cost of Ownership (TCO) calculation?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 0}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000 (Assuming consumer-grade hardware pricing)", "~$10,000 (Assuming previous generation A100 pricing)", "~$30,000 (Accurate H100 enterprise baseline)", "~$240,000 (Mistaking an 8-GPU HGX baseboard for a single GPU)"], "correct_index": 2}}, {"id": "cloud-0360", "title": "The Federated Learning Data Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much gradient data will 10 million users upload per week at 200 MB each, and what does that imply for backend scale?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 0}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2 Terabytes", "200 Terabytes", "2 Petabytes", "20 Petabytes"], 
"correct_index": 2}}, {"id": "cloud-0366", "title": "The H100 Unit Cost: TCO & Cost Modeling", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost of a single NVIDIA H100 GPU, a standard component for large-scale AI training in the cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000 (Confuses datacenter GPU with a high-end consumer card)", "~$10,000 (Underestimates the premium for enterprise-grade features)", "~$30,000", "~$100,000 (Confuses the cost of a single GPU with a fully-equipped server)"], "correct_index": 2}}, {"id": "cloud-0367", "title": "The TCO of Privacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What annual data-transfer cost would centralizing 10M users' 1MB/day create, and how much would FL avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$71 per year", "$200 per year", "$73,000 per year", "$584,000 per year"], "correct_index": 2}}, {"id": "cloud-0369", "title": "The H100 Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the FP16 ridge point for an H100 GPU, and what does that number say about workload performance?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.003 Ops/Byte. This implies almost any workload is compute-bound.", "~37 Ops/Byte. This results from incorrectly using bits instead of bytes for memory bandwidth.", "~295 Ops/Byte. A workload's arithmetic intensity must exceed this to be compute-bound.", "~1,342 Ops/Byte. 
This is the ridge point for an edge device (Jetson AGX Orin), not a datacenter GPU."], "correct_index": 2}}, {"id": "cloud-0372", "title": "The Iceberg of Inference Costs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When analyzing the Total Cost of Ownership (TCO) for this model, which of the following components typically contributes the most to the total cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of the initial model training run.", "The salaries of the ML engineering and research team.", "Ongoing inference costs (server hosting, power, maintenance).", "The cost of data acquisition, cleaning, and labeling."], "correct_index": 2}}, {"id": "cloud-0374", "title": "The Power Efficiency Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Within a 30W TDP budget, which accelerator delivers higher sustained INT8 throughput, and what effective TOPS can it sustain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Hailo-8, because its TOPS/W efficiency is over 2x higher.", "Jetson Orin, because its peak performance (275 TOPS) is the highest.", "Jetson Orin, because at 30W it can sustain ~137 TOPS, while the Hailo-8 is capped at 26 TOPS.", "Hailo-8, because with its higher efficiency it can deliver 312 TOPS (10.4 TOPS/W * 30W) in a 30W budget."], "correct_index": 2}}, {"id": "cloud-0375", "title": "The 70kW Rack Limit", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many 700W H100 GPUs can you safely install in a 70 kW rack when PUE is 1.2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 GPUs", "120 GPUs", "83 GPUs", "87 GPUs"], "correct_index": 2}}, {"id": "cloud-0377", "title": "The RAG Rollback Reflex", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the standard, most immediate operational response to this incident?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Begin a root-cause analysis by inspecting the new data file for corruption.", "Take the entire chatbot system offline to prevent more bad answers.", "Immediately trigger an automated rollback to the previous version of the vector database.", "Fine-tune the base LLM with the new information to teach it the correct behavior."], "correct_index": 2}}, {"id": "cloud-0380", "title": "The Datacenter Heat Wave", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If cooling can dissipate only 80% of an H100's 700W TDP, what sustained FP16 throughput should you expect and 
why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~198 TFLOPS (Calculated Trap: Calculates 20% performance instead of 80%)", "989 TFLOPS (Calculated Trap: Assumes TDP throttling does not impact throughput)", "~791 TFLOPS", "560 TFLOPS (Calculated Trap: Confuses wattage limit with TFLOPS output)"], "correct_index": 2}}, {"id": "cloud-0381", "title": "The On-Node Interconnect Hierarchy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is a transfer over the PCIe Gen5 bus compared to a direct transfer over NVLink 4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PCIe is ~10x slower", "They are about the same speed", "PCIe is ~2x slower", "PCIe is ~4x slower"], "correct_index": 2}}, {"id": "cloud-0382", "title": "The 5% Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry standards, what is the approximate annual cost for hardware maintenance, expressed as a percentage of the initial CapEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1%", "5%", "20%", "33%"], "correct_index": 1}}, {"id": "cloud-0383", "title": "The First-Year TCO of a Small AI Factory", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What first-year TCO should finance budget for 10 GPUs running 24/7, including CapEx, maintenance, and PUE-adjusted power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$315,000", "~$322,300", "~$336,700", "~$8,400,000"], "correct_index": 2}}, {"id": "cloud-0384", "title": "The H100's Memory Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the 989 TFLOPS FP16 compute and 3.35 TB/s memory bandwidth, why is the kernel memory-bound, and what is its arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; it has nearly a PetaFLOP of compute. (Calculated trap: Ignores Arithmetic Intensity)", "Memory-bound; its Arithmetic Intensity (~0.167 Ops/Byte) is far below the ridge point (~295 Ops/Byte). (Correct)", "Compute-bound; its Arithmetic Intensity (~295 Ops/Byte) is very high. (Calculated trap: Uses ridge point as Arithmetic Intensity)", "Memory-bound; its Arithmetic Intensity is ~0.5 Ops/Byte. 
(Calculated trap: Ignores write byte cost)"], "correct_index": 1}}, {"id": "cloud-0385", "title": "The Guardrail Canary Cost", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much additional HBM is required on canary instances to keep both 1B-parameter FP16 guardrail models loaded?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 GB", "4 GB", "2 GB", "16 GB"], "correct_index": 2}}, {"id": "cloud-0386", "title": "The Dominant Factor in TCO", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which phase typically accounts for the largest portion of the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time cost of the initial model training run.", "The cumulative cost of running the inference fleet.", "The cost of data acquisition and labeling.", "The salaries of the engineering team maintaining the model."], "correct_index": 1}}, {"id": "cloud-0387", "title": "The CapEx of an A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What initial GPU CapEx is required for a three-month shadow-serving experiment that needs 100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$30,000", "$300,000", "$3,000,000", "$150,000"], "correct_index": 2}}, {"id": "cloud-0389", "title": "The RAG Rollout Bandwidth Bill", "topic": "compound-ai-systems", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much total data must be pulled from the container registry to the nodes for this initial canary deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["400 GB", "7.5 TB", "750 GB", "700 GB"], "correct_index": 2}}, {"id": "cloud-0390", "title": "The Hidden Cost of Hardware", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on industry rules of thumb, what is the approximate annual maintenance cost for a single server GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$150", "$1,500", "$7,500", "$15,000"], "correct_index": 1}}, {"id": "cloud-0393", "title": "The Cross-Rack Communication Gap", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For cross-rack GPU communication during distributed training, what is the lowest-latency interconnect and its approximate latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["NVLink 4.0 at ~500 ns", "PCIe Gen5 at ~1,000 ns", "InfiniBand NDR at ~5,000 ns", "Cross-country Fiber at ~40,000,000 ns"], "correct_index": 2}}, {"id": "cloud-0394", "title": "The RAG Update Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can the FP16 13B RAG model plus its 4 GB vector index fit within a 32 GB container memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 GB. It fits easily.", "17 GB. It fits with plenty of room.", "30 GB. It fits, but with a small margin.", "212 GB. It does not fit; it requires a much larger node."], "correct_index": 2}}, {"id": "cloud-0396", "title": "The A/B Test Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the total infrastructure cost for a one-week 50/50 experiment, assuming an H100 cloud instance costs $4.50/hour?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$540 (Trap: 5 GPUs * 24 hours * $4.50 = 1 day only)", "$2,268 (Trap: 3 GPUs * 168 hours * $4.50 = Challenger only)", "$3,024 (Trap: 4 GPUs * 168 hours * $4.50 = Assumes same size fleets)", "$3,780 (Correct)"], "correct_index": 3}}, {"id": "cloud-0398", "title": "The RAG Model Upgrade", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many Gigabytes of memory will the new 7B parameter model require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB", "28 GB", "14 GB", "112 GB"], "correct_index": 2}}, {"id": "cloud-0402", "title": "The TCO Maintenance Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What rule of thumb estimates annual hardware maintenance as a percentage of initial CapEx for a large GPU cluster?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.5%", "20%", "5%", "33%"], "correct_index": 2}}, {"id": "cloud-0403", "title": "The TCO of Privacy: Federated Learning's Compute Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If the centralized model requires 6.0 x 10^23 FLOPs to train, what is the additional compute cost the company will incur by adopting Federated Learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["`7.2 x 10^23` FLOPs", "`1.2 x 10^22` FLOPs", "`1.2 x 10^23` FLOPs", "`4.8 x 10^23` FLOPs"], "correct_index": 2}}, {"id": "cloud-0405", "title": "The Great Divide: Intra-Node vs. 
Inter-Node", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If you need to transfer a large tensor between two GPUs located within the same physical server, which interconnect is designed to provide the highest bandwidth for this specific task?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand NDR", "PCIe Gen5", "NVLink 4.0", "HBM3 Memory"], "correct_index": 2}}, {"id": "cloud-0407", "title": "The CapEx Foundation of TCO", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate unit cost for a single NVIDIA H100 GPU that you would use as the baseline for this calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$3,000", "~$300,000", "~$30,000", "~$3/hour"], "correct_index": 2}}, {"id": "cloud-0408", "title": "The TCO of an H100", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the approximate 3-year TCO for one H100, including 30k CapEx, 700W at PUE 1.2, 0.10/kWh power, and 5% maintenance?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 1}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$32,200", "~$36,340", "~$36,700", "~$2,242,000"], "correct_index": 2}}, {"id": "cloud-0409", "title": "The MatMul Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is a 4096x4096 FP16 matrix multiplication on a modern accelerator compute-bound or memory-bound, and why based on arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because accessing over 100 MB from HBM is the primary bottleneck.", "Memory-bound, because its Arithmetic Intensity is low (~8 Ops/Byte), falling below the hardware's ridge point.", "Compute-bound, because its Arithmetic Intensity (~1365 Ops/Byte) is significantly higher than the hardware's ridge point (~295 Ops/Byte).", "It's impossible to tell without knowing the GPU's cache hit rate for the operation."], "correct_index": 2}}, {"id": "cloud-0411", "title": "The TCO Iceberg: Training vs. 
Inference", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Over a typical 3-year production ML lifecycle, what is the approximate cost ratio of cumulative inference to initial training?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Training is 5-10x more expensive than inference.", "They are roughly equal in cost (1:1 ratio).", "Inference is 5-10x more expensive than training.", "Inference is over 100x more expensive than training."], "correct_index": 2}}, {"id": "cloud-0414", "title": "The RAG Rollout Storage Tax", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What peak vector-index storage is required during a blue-green rollout replacing a 14 GB RAG index with a new 14 GB version?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "21 GB", "28 GB", "42 GB"], "correct_index": 2}}, {"id": "cloud-0415", "title": "The Hidden Cost of Federation", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "At production scale, what is the largest and most underestimated new TCO cost introduced by adopting Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network bandwidth costs from millions of devices sending model updates.", "Compute cost of the central server that aggregates model updates.", "Sustained engineering and operational complexity to manage the distributed fleet.", "The CapEx for 100 aggregation servers."], "correct_index": 2}}, {"id": "cloud-0416", "title": "The Annual Energy Cost of an Inference Cluster", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the approximate annual energy cost for the 10 H100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$6,132 (Calculated Trap: Ignores the 1.1 PUE overhead)", "~$15,000 (Calculated Trap: Calculates 5% annual maintenance on CapEx)", "~$6,745", "~$67,452 (Calculated Trap: Assumes $1.00/kWh instead of $0.10/kWh)"], "correct_index": 2}}, {"id": "cloud-0417", "title": "The H100 Roofline Ridge Point: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the H100's FP16 ridge point, and what does it imply for memory- vs compute-bound kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~295,000 FLOPs/Byte. This indicates most models are memory-bound.", "~0.0034 FLOPs/Byte. This indicates most models are compute-bound.", "~295 FLOPs/Byte. 
Algorithms with arithmetic intensity below this are memory-bound.", "~295 FLOPs/Byte. Algorithms with arithmetic intensity above this are memory-bound."], "correct_index": 2}}, {"id": "cloud-0418", "title": "The 70B Inference Footprint", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the approximate memory needed just to load the model weights for inference in half-precision (FP16)?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 0}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 GB", "14 GB", "140 GB", "1120 GB"], "correct_index": 2}}, {"id": "cloud-0419", "title": "The RAG Update Memory Footprint", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the memory implications of the blue-green re-index, and how much extra storage is needed for 1M FP16 4096-d embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.1 GB", "~16.4 GB", "~8.2 GB", "~0 GB"], "correct_index": 2}}, {"id": "cloud-0421", "title": "The First-Year Cost of an H100", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the first-year TCO for one H100 running 24/7, including CapEx, maintenance, and power at $0.15/kWh with 1.1 PUE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$30,000", "~$31,500", "~$32,420", "~$32,512"], "correct_index": 3}}, {"id": "cloud-0422", "title": "The Economics of Privacy: Federated vs. Centralized Training", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "At $0.05/GB and 100 FL rounds, what are the transfer costs for centralized training versus federated learning, and which is cheaper?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$25 (FL) vs. $500,000 (Centralized)", "$10,000 (FL) vs. $500,000 (Centralized)", "$500,000 (FL) vs. $500,000 (Centralized)", "$5,000 (FL) vs. 
$500,000 (Centralized)"], "correct_index": 3}}, {"id": "cloud-0423", "title": "The Data Pipeline Stall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely bottleneck in your node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0426", "title": "The Batch Size Sweet Spot", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does throughput increase 51x with a 64x batch size increase, and at what batch size do diminishing returns begin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput increases because memory bandwidth scales linearly with batch size. Diminishing returns start at batch 64.", "Throughput increases because the GPU caches the weights in SRAM. Diminishing returns start at batch 1024 when SRAM is full.", "Throughput increases because the workload shifts from compute-bound to memory-bound. Diminishing returns start at batch 128 due to KV-cache limits.", "Throughput increases by reusing weight reads from HBM, increasing arithmetic intensity. Practical diminishing returns start around batch 64 due to KV-cache VRAM limits, even though the ridge point is near batch 295."], "correct_index": 3}}, {"id": "cloud-0427", "title": "The Small Batch Anomaly", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might a powerful CPU complete batch-1 inference for a small CNN faster than a high-end GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0428", "title": "The OOM Error", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the system OOM instantly on step 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0429", "title": "The GPU Utilization Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Where did 77% of your GPU-hours go, and what is the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0430", "title": "The Token Throughput Estimate", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "During autoregressive decoding at batch=1, roughly how many tokens/sec can a 70B LLM generate, and why is it memory-bandwidth bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-0432", "title": "The Arithmetic Intensity Question", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is an H100 showing only 15% of peak TFLOPS not necessarily broken?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0434", "title": "The Mid-Afternoon Throttling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of the daily 15-20% throughput drop when GPU power is capped around 560W at 2 PM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server's Power Supply Unit (PSU) is failing and cannot provide the full 700W to the GPU.", "The training script has a software bug that reduces computational intensity after a few hours of running.", "A 'noisy neighbor' is running on the same server, stealing CPU cycles and starving the GPU of data.", "The datacenter's ambient temperature is rising in the afternoon, reducing the GPU's thermal headroom and forcing it to throttle power."], "correct_index": 3}}, {"id": "cloud-0435", "title": "The Illusion of Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the speedup so negligible?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 0}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is now memory-bandwidth bound because loading the sparse weight indices from HBM is slower than the compute savings.", "The pruning is unstructured, which prevents the H100's Tensor Cores from accelerating the matrix math and leads to inefficient, irregular memory access.", "Kernel launch overhead now dominates the execution time, as the pruned model still requires launching the same number of CUDA kernels.", "The model has been compressed on disk, but the GPU driver is decompressing it back to a dense format in memory, nullifying the pruning."], "correct_index": 1}}, {"id": "cloud-0436", "title": "The Distillation Cost-Benefit Analysis", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the cost of distilling the 175B teacher into a 7B student on 140B tokens at 40% MFU and $2/GPU-hour, and what drives it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around $7,700, primarily driven by the student model's backpropagation steps.", "Around $193,000, because the MFU is so low that it makes the hardware inefficient.", "Around $77,100, primarily driven by the compute required for the teacher model's forward passes.", "Around $30,000, primarily driven by the HBM memory capacity needed to hold both models."], "correct_index": 2}}, {"id": 
"cloud-0437", "title": "The Tale of Two Latencies", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which optimizations address the 8k-token prefill latency and the chatbot time-per-token latency for the 70B LLM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply FlashAttention to the chatbot to reduce per-token time, and Speculative Decoding to the summarizer to handle the long context.", "Apply FlashAttention to the summarizer to fix prefill latency, and Speculative Decoding to the chatbot to reduce per-token latency.", "Apply FlashAttention to both; it speeds up all attention calculations, which will fix both prefill and decoding latency.", "Neither. The issue is network latency for the chatbot and an insufficient batch size for the summarizer, not the model's architecture."], "correct_index": 1}}, {"id": "cloud-0438", "title": "The Tensor Parallelism Choke Point", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should the 4GB all-reduce take with all 8 GPUs on NVLink versus split across two servers, and what bottleneck does this reveal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bottleneck is PCIe Gen5 bandwidth, as the data must cross the PCIe bus taking ~220ms (calculating 7GB / 32GB/s).", "Software overhead in the NCCL communication library is the primary issue; the hardware difference between NVLink and InfiniBand is mathematically negligible.", "The bottleneck is the InfiniBand NDR link, which has ~18x less bandwidth (50 GB/s vs 900 GB/s). 
The inter-node transfer takes ~140ms vs ~7.8ms for the all-NVLink case.", "The bottleneck is HBM memory access latency; reading the 4 GB tensor from HBM on each GPU at 3.35TB/s is slower than the network transfer itself."], "correct_index": 2}}, {"id": "cloud-0439", "title": "The Silent Failure Cascade", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted time for the 4,096-GPU, 25-day 175B LLM training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Checkpoint every 5 minutes to be safe.", "Checkpoint roughly every 18-20 minutes.", "Checkpoint once per day to minimize overhead.", "The job failed after 3 days, so checkpointing every 48 hours is sufficient."], "correct_index": 1}}, {"id": "cloud-0440", "title": "The Vision Transformer Resolution Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does doubling resolution make ViT exceed 20ms while ResNet stays within budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0441", "title": "The GPU-Bound Inference Stall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this poor performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too small to saturate the GPU's arithmetic units.", "The PCIe Gen5 bus is a bottleneck, preventing data from reaching the GPU quickly enough.", "The system is dispatch-bound due to high kernel launch overhead from numerous small operations.", "The model is memory-bandwidth bound because it needs to read large embedding tables from HBM."], "correct_index": 2}}, {"id": "cloud-0442", "title": "The Long-Context OOM Failure", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given that the model parameters (140GB in FP16, distributed via Tensor Parallelism) and optimizer states fit in memory, what is the most likely cause of this sudden OOM error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Adam optimizer states have doubled in size due to the longer sequence.", "The full (N, N) attention score matrix is being materialized in HBM, which scales quadratically.", "The KV-cache for the 8192-length context is too large to fit in memory.", "The gradient checkpointing buffer is overflowing with the larger activation sizes."], "correct_index": 1}}, {"id": "cloud-0444", "title": "The Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause for these diminishing returns?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large, causing PCIe bus saturation as data is swapped to system RAM.", "The All-Reduce collective operation is saturating the inter-node InfiniBand network.", "The training workload is compute-bound, and we have hit maximum TFLOPS.", "NVLink bandwidth is insufficient for the amount of intra-node gradient sharing required."], "correct_index": 1}}, {"id": "cloud-0445", "title": "The Embedding Lookup Lag", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which bus protocol is the critical path and what is its approximate latency for a single lookup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand RDMA, with a latency of ~5,000 ns per lookup. (Calculated trap: Assumes inter-node routing)", "PCIe Gen5, with a latency of ~1,000 ns per lookup. (Calculated trap: Assumes non-NVLink traversal)", "NVLink, with a combined bus and remote HBM access latency of ~800 ns per lookup. (Correct)", "HBM3 memory bandwidth. (Calculated trap: Assumes bandwidth-bound instead of latency-bound)"], "correct_index": 2}}, {"id": "cloud-0446", "title": "The Silent GPU Killer", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely cause of this failure, and what should be your immediate action?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The training code has entered an infinite loop, causing the application to hang. You should attach a debugger to the Python process.", "The data loading pipeline is stuck, starving the GPU and freezing the process. You should investigate the data loader and network performance.", "A transient GPU hardware fault occurred, confirmed by the `dmesg` error. You should ensure the job automatically restarts from its last checkpoint on a different node.", "The periodic checkpointing process is hanging while writing to the file system, which freezes the training loop. You should investigate the storage system's health."], "correct_index": 2}}, {"id": "cloud-0447", "title": "The CISO vs. 
The CFO: Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using ALE, should the bank choose a 650k centralized 70B training run or federated learning if centralization raises 100M breach risk from 1% to 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The $650,000 cost per training run is the dominant factor, as frequent retraining could exceed millions per year.", "The engineering cost of building and maintaining a complex Federated Learning pipeline will be the highest cost.", "The $4M increase in Annualized Loss Expectancy from centralizing the data is the most significant financial factor.", "The network egress cost to transfer petabytes of data from partner banks to the central cloud will be the largest one-time expense."], "correct_index": 2}}, {"id": "cloud-0448", "title": "The Satellite Imagery Scaling Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 4096x4096 satellite imagery, should you choose a ConvNet or a standard ViT, and how large is the scaling gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT, as it is a more modern architecture that generally outperforms ConvNets, indicating superior feature learning capabilities.", "Both are comparable; since they have a similar number of parameters, their inference FLOPs and serving cost will be roughly the same.", "The ConvNet, because the ViT's attention mechanism scales quadratically with the number of patches, leading to an intractable explosion in compute at high resolutions.", "The ConvNet, because it will have higher arithmetic intensity and better saturate the GPU's memory bandwidth compared to the ViT."], "correct_index": 2}}, {"id": "cloud-0450", "title": "The PCIe Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most probable architectural bottleneck causing the GPU to starve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVMe SSD array cannot provide data fast enough to the CPU.", "The bandwidth of the PCIe Gen5 bus is insufficient to keep the GPU's memory fed with data.", "The number of CPU workers in the DataLoader is too low, causing a preprocessing bottleneck.", "The GPU's L2 cache is too small, causing frequent, slow misses to HBM."], "correct_index": 1}}, {"id": "cloud-0451", "title": "The Privacy-TCO Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for centralized versus federated training across 10 hospitals with 50TB each, and which should you recommend?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 2}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper because its one-time compute cost ($80k) is far less than the federated hardware cost ($800k).", "Federated is cheaper primarily because it avoids the massive recurring cost of the specialized compliance team and long-term cloud storage.", "Centralized is cheaper because the data egress cost for federated learning ($1,500) will grow to be the largest expense over time.", "They are roughly equivalent in cost; the higher hardware CapEx of the federated approach is offset by the higher compute cost of the centralized one."], "correct_index": 1}}, {"id": "cloud-0452", "title": "The Startup's Scaling Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 1T tokens and 1M H100-hours, should the startup train the 2B model or the 25B model under Chinchilla scaling?", "chain_ids": ["cloud-chain-auto-secondary-013-03"], "chain_positions": {"cloud-chain-auto-secondary-013-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 25B model, because its dense matrix multiplications will achieve higher MFU on H100s, making more efficient use of the grant.", "The 25B model, because it is closer to the data-optimal size for 1T tokens and easily fits within the compute budget.", "The 2B model, because the 25B model is too large for the 1T token dataset, making it data-constrained and leading to wasted compute.", "The 2B model, because smaller models require fewer FLOPs per parameter, allowing us to train for more epochs."], "correct_index": 1}}, {"id": "cloud-0453", "title": "The Single-Node Slowdown", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the 14GB inter-GPU transfer taking about 220ms on the 8-H100 DataParallel server?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is bottlenecked on data preprocessing, starving the GPUs.", "The InfiniBand network connection to other nodes is saturated.", "The server lacks a direct NVLink bridge, forcing GPU communication over the slower PCIe bus.", "The H100 HBM3 memory bandwidth is insufficient for the model size."], "correct_index": 2}}, {"id": "cloud-0454", "title": "The Scaling Efficiency Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does switching from 400Gbps InfiniBand to 100Gbps Ethernet balloon step time from 11.3s to over 27s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cluster's storage (NVMe) is too slow for writing checkpoints at each step.", "The PCIe bus on each node is saturated from transferring data to the network card.", "The Ethernet cluster lacks RDMA, forcing slow, CPU-mediated data transfers for gradient synchronization.", "The 4x reduction in raw bandwidth strictly forces a 4x increase in total step 
time."], "correct_index": 2}}, {"id": "cloud-0455", "title": "The HealthTech TCO Dilemma", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which smart-reply strategy has lower annual fleet TCO after adding HIPAA breach ALE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because its direct compute and engineering costs are lower before accounting for breach-risk exposure.", "Federated, because data egress costs to upload petabytes of data from 100 hospitals would exceed the engineering overhead.", "Federated, because the Annual Loss Expectancy from a potential data breach in the centralized model makes it significantly more expensive.", "Centralized, because FL models converge slower and have lower accuracy, leading to hidden opportunity costs in product quality that outweigh the breach risk."], "correct_index": 2}}, {"id": "cloud-0456", "title": "The SLA-Driven Batching Strategy", "topic": "activation-memory", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is large static batching a poor solution for this unified service, and what scheduling approach is better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a good solution; the chatbot users will just have to tolerate higher latency.", "It fails because large static batches can exhaust HBM, causing frequent swapping.", "It fails because high-latency chatbot requests will be starved by the throughput-focused batch jobs. A better solution is continuous batching with priority scheduling.", "It's better to build two separate physical clusters, one for each workload, to guarantee isolation."], "correct_index": 2}}, {"id": "cloud-0457", "title": "The Intra-Node Scaling Failure", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does scaling from 4 to 8 H100s in one server yield only a 1.3x speedup when all-reduce is slow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to storage is saturated.", "The GPUs are communicating over the PCIe bus instead of the NVLink fabric.", "The HBM3 memory on each GPU doesn't have enough bandwidth to handle the gradients.", "The ring all-reduce algorithm is inefficient and should be replaced with a tree all-reduce."], "correct_index": 1}}, {"id": "cloud-0458", "title": "The Mysterious Multi-Node Slowdown", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause for this drastic inter-node communication slowdown?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The intra-node NVLink bandwidth is insufficient to feed the InfiniBand NIC. 
(NVLink trap)", "RDMA has failed, forcing communication to use a slow, CPU-bound IP-over-InfiniBand fallback.", "The 400 Gbps InfiniBand switch lacks the capacity for this model size. (Raw BW trap)", "The PCIe bus connecting the InfiniBand NIC to the motherboard is saturated. (PCIe trap)"], "correct_index": 1}}, {"id": "cloud-0459", "title": "The Hospital TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Over three years, which option has lower TCO for quarterly medical-imaging retraining: centralized cloud training or federated on-prem training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Path A is cheaper because the hospital avoids the large $500,000 upfront server cost (CapEx).", "Path B is cheaper because the 3-year operational costs are significantly lower than the recurring cloud rental fees, easily justifying the initial CapEx.", "Path B is more expensive because the cost of electricity and maintenance for 10 servers over 3 years exceeds the cost of renting GPUs.", "The costs are roughly equivalent, so the decision should be based purely on data privacy concerns, not economics."], "correct_index": 1}}, {"id": "cloud-0460", "title": "The Real-Time Voice Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the hardware meet the 350ms TTFT target for a 200-token prompt on a 7B LLM, and what are the prefill and first-decode times?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the 7B model's prefill computation will take over 500ms.", "Yes, the total TTFT including overheads and GPU processing is well under 100ms.", "Maybe, it depends entirely on the batch size the server is currently processing.", "No, reading 14GB of model weights from HBM will violate the 350ms deadline."], "correct_index": 1}}, {"id": "cloud-0461", "title": "The Two-Node Scaling Disappointment", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely physical bottleneck causing the scaling efficiency to drop off between two 8xH100 nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus on each server is saturated from transferring data to the network card.", "The chosen ring-based All-Reduce algorithm is inefficient for a two-node setup.", "The physical bandwidth of the inter-node InfiniBand connection is ~18x lower than the intra-node NVLink fabric.", "There is not enough HBM3 memory on the GPUs to store the gradients before communication."], "correct_index": 2}}, {"id": "cloud-0462", "title": "The Federated TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using TCO including the 1% chance of a $50M breach, should the fintech choose centralized 
training or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated Learning, because the $50M potential fine is an unacceptable business risk that outweighs any calculated TCO difference.", "Federated Learning, because its TCO of $1,800,000 provides total insurance against a $50M fine. The engineering cost is a one-time capital expense and shouldn't be in the TCO calculation.", "Centralized Training, because its TCO of $810,000 is significantly lower than the Federated Learning TCO of $1,800,000.", "Centralized Training, as its TCO is lower. The primary cost driver for Federated Learning is the high energy consumption from on-device training across millions of phones."], "correct_index": 2}}, {"id": "cloud-0463", "title": "The Phantom PCIe Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of this All-Reduce bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to the storage cluster is saturated with checkpointing traffic.", "The server's CPU is too slow to orchestrate the high-frequency gradient exchange between the 8 GPUs.", "The communication library (NCCL) is misconfigured and is routing traffic over the PCIe bus instead of NVLink.", "The HBM3 memory on the GPUs is too slow to read the gradients before the `All-Reduce` operation can begin."], "correct_index": 2}}, {"id": "cloud-0465", "title": "The Privacy Premium", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for centralized versus federated training across 10 hospitals, and which strategy would you recommend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated is cheaper. The $72,000 annual compute cost for Centralized training over 3 years is more than the cost of maintaining the FL system.", "Centralized is far more expensive. The data transfer cost for 100TB of data is over $1M alone, making it prohibitive.", "Centralized TCO is ~$944K and Federated TCO is ~$1.42M. Centralized is cheaper financially, but the recommendation must explicitly weigh the medical-data privacy and breach risk.", "The TCOs are comparable. Centralized is ~$1.4M, while Federated is ~$1.5M. 
We should choose Centralized to get better model performance."], "correct_index": 2}}, {"id": "cloud-0466", "title": "The A/B Test TCO Trap", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can you demonstrate which option has the lower Total Cost of Ownership (TCO) for this two-model training run?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0467", "title": "The Two-Node Scaling Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What hardware bottleneck explains the poor scaling when moving from 8 H100s on one node to 16 H100s across two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU compute is saturated; the model is too small for 16 GPUs.", "The PCIe Gen5 bus connecting the GPUs to the CPU is saturated by the increased data parallelism.", "The inter-node InfiniBand connection has become the primary bottleneck for the data-parallel all-reduce.", "The intra-node NVLink bandwidth is insufficient for the 8-way tensor parallel collectives."], "correct_index": 2}}, {"id": "cloud-0468", "title": "The Multi-Node Scaling Cliff: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the primary bottleneck limiting the 16-node training job's scaling efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring activations between the CPU and GPU memory.", "HBM3 memory bandwidth on the GPUs is insufficient to keep the Tensor Cores fed during computation.", "The inter-node InfiniBand fabric is saturated by the 140 GB gradient All-Reduce step.", "The CPU is bottlenecking the data loading pipeline, starving all GPUs of new batches."], "correct_index": 2}}, {"id": "cloud-0470", "title": "The Tensor Parallel Scaling Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck is causing the 4GB tensor all-reduce to dominate when scaling from 8 to 16 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus connecting the GPUs to the motherboard is saturated by the increased traffic.", "The all-reduce algorithm in the framework's code is inefficient and doesn't scale beyond 8 GPUs.", "The inter-node InfiniBand network has much lower bandwidth than the intra-node NVLink fabric, creating a communication bottleneck.", "The GPUs are compute-bound and cannot keep up with the data from 16 parallel processes."], "correct_index": 2}}, {"id": "cloud-0471", "title": "The Tensor Parallelism Traffic Jam", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely diagnosis for this performance issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting to storage is saturated, delaying gradient accumulation.", "The GPUs are communicating over the PCIe bus instead of NVLink due to a system misconfiguration.", "The model's activations are too large, causing slow data movement from HBM to the streaming multiprocessors.", "The NCCL All-Reduce algorithm is misconfigured, using an inefficient Ring protocol instead of a Tree."], "correct_index": 1}}, {"id": "cloud-0475", "title": "The Inter-Node Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary communication bottleneck explains the 55% scaling efficiency and saturated inter-server links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring gradients between the CPU and the GPUs.", "The NVLink switch within each server is overloaded by the 8-GPU all-reduce traffic.", "The InfiniBand network bandwidth between servers is insufficient for inter-node gradient synchronization.", "The server's CPU is unable to schedule the RDMA (Remote Direct Memory Access) operations fast enough."], "correct_index": 2}}, {"id": "cloud-0479", "title": "The Cross-Rack Embedding Fetch", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum latency for GPU_A to RDMA-read GPU_B’s remote embedding, and which component dominates it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The HBM3 memory access on GPU_B (~300 ns), because accessing off-chip memory is fundamentally slow.", "The request is processed by the CPU on both servers, adding significant software overhead.", "The round trip over the InfiniBand network (~8,000 ns), as it involves two cross-rack transfers.", "The speed of light delay across the datacenter floor, which is on the order of milliseconds."], "correct_index": 2}}, {"id": "cloud-0480", "title": "The Two-Node Scaling Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Diagnose the most likely cause of this poor scaling when the 70B LLM's gradient All-Reduce crosses the 400 Gbps InfiniBand link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVLink switch within each server is saturated by the 8-GPU All-Reduce traffic, creating a bottleneck.", "The PCIe Gen5 bus connecting the GPUs to the CPU is unable to handle the gradient data from 8 GPUs simultaneously.", "The 400 Gbps InfiniBand link between the two servers has insufficient bandwidth for the 280 GB cross-node gradient synchronization.", "TCP/IP protocol overhead on the InfiniBand 
network is adding too much latency to the All-Reduce operation."], "correct_index": 2}}, {"id": "cloud-0481", "title": "The Privacy vs. Churn Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which Smart Reply approach—centralized or federated—should you choose for 10 million users, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Option A (Centralized), because the $350k compute cost is much lower than the $900k engineering headcount for Option B.", "Option B (Federated), because the $1M data breach risk of the centralized model is an unacceptable business liability.", "Option A (Centralized), because the annual cost from user churn in the federated model (~$3M) is the single largest expense and far outweighs its privacy benefits.", "Option B (Federated), because it avoids centralizing PII, and the $5 LTV is too small to worry about a minor churn increase."], "correct_index": 2}}, {"id": "cloud-0482", "title": "The Multi-Node Scaling Ceiling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause limiting the scaling performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus on each node is saturated transferring data between the host CPU memory and the GPU.", "The training is compute-bound; the H100 GPUs are simply not powerful enough to handle a 175B model efficiently.", "The gradient all-reduce step is saturating the InfiniBand interconnect between nodes.", "There is a software deadlock in the NCCL communication library when using more than 8 nodes."], "correct_index": 2}}, {"id": "cloud-0486", "title": "The Two-Node Scaling Cliff: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the two servers are connected via a 400 Gbps InfiniBand NDR link, what is the most likely bottleneck causing this scaling cliff?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 2}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated transferring data between the host CPU memory and the GPUs.", "The InfiniBand network connecting the two servers has become the primary bottleneck.", "The NVLink fabric within each server is overloaded by the 16-GPU All-Reduce traffic.", "The NCCL All-Reduce algorithm is implemented inefficiently and is not optimized for 16-GPU configurations."], "correct_index": 1}}, {"id": "cloud-0488", "title": "The Cross-Node All-Reduce Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary communication bottleneck causes the 1.5x speedup when scaling the 175B model 
from one NVLink node to two InfiniBand nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from loading data for the next step.", "The NVLink 4.0 bandwidth within each node is insufficient for 8 H100s during the All-Reduce.", "The InfiniBand NDR network connecting the two nodes is the bottleneck.", "The model is simply too large, and the total gradient size exceeds what modern interconnects can handle efficiently."], "correct_index": 2}}, {"id": "cloud-0490", "title": "The Disappointing Scaling Factor", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 70B data-parallel job get only 1.2x speedup when All-Reduce crosses two InfiniBand-connected H100 servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus in each server is saturated from copying gradients between the GPUs and the CPU's main memory.", "The InfiniBand network connecting the two servers has insufficient bandwidth for the 140 GB gradient synchronization required at each step.", "The NVLink switch within each 8-GPU server is the bottleneck, as it cannot handle the All-Reduce traffic from 8 H100s simultaneously.", "The model is too large. Using a smaller 7B parameter model would resolve the communication bottleneck without changing hardware."], "correct_index": 1}}, {"id": "cloud-0491", "title": "The Hospital AI Rollout: Centralized vs. Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Over 3 years, which strategy is more economical for 500 hospitals refreshing 10 TB each annually, centralized or federated training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper. The 3-year TCO is $1.5M ($500k/yr * 3) vs. Federated's $1.45M, because data costs are negligible.", "Federated is cheaper. The 3-year TCO is ~$1.45M compared to the Centralized TCO of >$4.0M.", "Centralized is cheaper. The 3-year TCO is $1.6M ($500k compute + $100k ingress) vs. 
Federated's $1.45M, ignoring storage costs.", "Federated is cheaper, but only because it avoids the $3.6M recurring storage charge for the retained 5 PB centralized copy."], "correct_index": 1}}, {"id": "cloud-0492", "title": "The Distributed Training Stall", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck explains the 4-node AllReduce slowdown for the 175B FP16 job on 400 Gbps InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated transferring data between the host CPU memory and the GPUs.", "The on-node NVLink 4.0 interconnect is the bottleneck, as 8-way tensor parallelism is too communication-intensive.", "The 400 Gbps InfiniBand interconnect is saturated by the 350 GB gradient synchronization during the AllReduce operation.", "The training is CPU-bound, as the CPUs on each node cannot schedule the NCCL kernels for the AllReduce operation fast enough."], "correct_index": 2}}, {"id": "cloud-0494", "title": "The Tensor Parallel Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What primary bottleneck explains the 420ms step time and 90% all-reduce overhead when scaling tensor parallelism to two nodes?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 1}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The `all-reduce` operation hits the InfiniBand bandwidth limit (50 GB/s), adding ~3.6 seconds of network wait per step.", "The `all-reduce` operation hits the PCIe Gen5 bus limit (64 GB/s), adding ~2.8 seconds of network wait per step.", "The `all-reduce` operation hits the InfiniBand bandwidth limit (400 GB/s), adding ~0.45 seconds of network wait per step.", "The `all-reduce` operation hits the NVLink bandwidth limit (900 GB/s), adding ~0.2 seconds of network wait per step."], "correct_index": 0}}, {"id": "cloud-0497", "title": "Cloud vs. Federated Keyboard Model: Privacy and Churn Tradeoff", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you decide between the A/B-tested cloud and federated keyboard models when accounting for cost, privacy risk, and churn?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Ship the Cloud model. The annual cost is only $876k, which is cheaper than the $1M lost to churn from the Federated model, and the 15% engagement lift is a huge business win.", "Ship the Federated model. It has a TCO of $0 since it runs on user devices, making it infinitely cheaper than the Cloud model which costs $876k per year.", "Reject the Cloud model on privacy grounds despite its lower TCO ($876k vs $1M). The risk of a data scandal is too high. The team must optimize the Federated model's power usage before launch.", "Reject both. 
The Cloud model's TCO is over $8M (25 GPUs * $30,000 CapEx * 10 for infrastructure) and the Federated model loses $1M. Neither is financially viable."], "correct_index": 2}}, {"id": "cloud-0505", "title": "H100 Roofline Diagnosis for Recommendation Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is your tech lead wrong to suggest buying a faster GPU when achieving 120 TFLOPS out of 989 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0506", "title": "The Silent Padding Tax", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch size 15 cause only about 11% MXU utilization on a TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0509", "title": "The Profiler Trace Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the overall GPU compute utilization, and where should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0511", "title": "The FSDP vs DDP Memory Trade-off", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which engineer is correct regarding the use of DDP versus FSDP for a 7B model, and what are the exact memory numbers?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 1}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0514", "title": "The Distributed Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the data loading bottleneck when scaling to 256 GPUs on a shared NFS server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0515", "title": "Diagnosing Redis Backpressure Causing Kafka Rebalance Storms", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do Redis saturation and Kafka rebalances cause the latency spike, and how do you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0516", "title": "The 100 TB Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", 
"phase": "training", "question": "How would you design the 100 TB preprocessing pipeline, and what end-to-end time should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0517", "title": "The Data Pipeline Determinism Trap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can using multiple PyTorch DataLoader workers break reproducibility despite setting global seeds, and what is the correct fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0526", "title": "The Preemption Penalty", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is dynamic preemption via PCIe KV-cache swapping feasible for a 13B model with a 100ms P99 SLO, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0531", "title": "The Sequence Parallelism Necessity", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is tensor parallelism alone insufficient, and what additional technique do you need?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0534", "title": "Cross-Node Tensor Parallelism Bottleneck at T=16", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is T=16 likely slower than T=8, and what is the exact communication cost per transformer layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0535", "title": "The TCP/IP CPU Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the CPU doing that is bottlenecking a 100 Gbps network during the AllReduce phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0536", "title": "The GPU Scheduling Dilemma", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you schedule these to maximize cluster utilization without starving any team?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0537", "title": "The Data Quality Pipeline", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "How would you design a data quality pipeline that catches silent corruptions in the 10 TB/day training feed within 1 hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0538", "title": "The Roofline Across Precisions", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the ridge point for each precision and explain why a workload that is compute-bound at FP16 can become memory-bound at INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0539", "title": "The FlashAttention Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can you get faster without doing less math?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0540", "title": "The NUMA Node Cross-Talk", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What physical motherboard bottleneck are you hitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0544", "title": "The Multi-Modal Prefill Stall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the high-resolution image processing so slow, and what fundamental compute trade-off is the PM missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0545", "title": "The Multi-Tenant Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is adding GPUs the wrong fix for Premium timeouts behind Standard jobs, and what architecture would prevent this priority inversion?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0546", "title": "The Speculative Decoding Trap", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does this optimization, designed to make things faster, cause a catastrophic throughput collapse under load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0547", "title": "The Speculative Backfire", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What was wrong with the speculative 
decoding rollout, and why does multi-turn code generation see worse time-per-token?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0548", "title": "The Power Efficiency Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why might deploying this 20% more power-efficient kernel be a catastrophic decision for your H100 fleet's throughput-oriented workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0550", "title": "The Speculative Decoding Backfire: Speculative Decoding", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an optimization that helps the average case cause a non-linear latency explosion for the worst case?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0553", "title": "The Roofline Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How is that physically possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0554", "title": "The Amdahl Ceiling", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Where did the other 480x of our hardware investment go?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0555", "title": "Context Parallelism for 1M-Token Attention", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you distribute the sequence across GPUs, and what's the communication pattern?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0557", "title": "Adaptive Routing Caused NCCL Packet Reordering", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What did Adaptive Routing do to the packets to break NCCL?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 3}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0558", "title": "The Quantization Bias Amplifier", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How did quantization amplify a bias 
that barely existed in the original model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0559", "title": "The Privacy Throughput Cliff", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does adding privacy guarantees have such a devastating systems cost, and how do you bring it down to something feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0560", "title": "The New Hotness vs. The Incumbent", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you invest in a new cluster of Prometheus-1 chips given their 5x peak compute, 2x memory bandwidth, and 3x price premium over H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0576", "title": "The Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What platform architecture makes INT8 quantization robust to distribution shift instead of relying on one-off calibration fixes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0577", "title": "The Silent NaN Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What three-tiered platform design would make one-click INT8 quantization safe for the 70B financial LLM and future models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0578", "title": "The Blackwell Bet: A $100M GPU Upgrade Decision", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you evaluate the $100M fleet upgrade beyond headline TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0580", "title": "The Billion-Frame Quantization Strategy", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural proposals, what data do you need to collect to de-risk them, and how would you prove safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0581", "title": "The Blackwell Bet: A Datacenter Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": 
"published", "phase": "both", "question": "How would you evaluate whether B200s or more H100 capacity can actually halve the 200B world-model training time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0582", "title": "The Quantization Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What strategy would you propose to get 2x throughput from INT8 while managing outlier activations and production distribution shift?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 3}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0583", "title": "Quantizing a 105B Model on H100s With Accuracy SLOs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you formulate a quantization strategy and hardware configuration that minimizes cost while preserving the client's accuracy SLO with validation and rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0585", "title": "70B LLM INT8 Fit vs ROUGE Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural proposals, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0586", "title": "The Blackwell Bet: Justifying a Datacenter Upgrade", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What data-driven framework would you use to decide whether to replace the 1,000 H100s with B200s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0587", "title": "The Silent Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What three-step plan would you propose to diagnose, mitigate, and solve the INT8 failure for power users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0588", "title": "The AI Startup's Cost of Goods Sold Crisis", "topic": "activation-memory", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What phased plan would you use to cut the 70B coding assistant's inference cost per user by 50% without noticeable quality loss?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0589", "title": "The Mixture-of-Experts Quantization Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design a quantization strategy that works, and what specific architectural failure explains the initial collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0590", "title": "The Catastrophic Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the most likely root cause of the INT8 collapse, how would you prove it experimentally, and what quantization strategy would you deploy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0591", "title": "The B200 Fleet Upgrade Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Would you recommend a full B200 upgrade for the fleet, or a heterogeneous fleet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0593", "title": "The Silent Overflow Catastrophe: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What safety-first quantization pipeline would you build after the INT8 fraud LLM passes offline tests but fails on production outliers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0595", "title": "The CPU-GPU Data Transfer Tax", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What happened during that first request, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0596", "title": "The Prefetch Buffer Sizing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might setting num_workers=32 make things worse, and what's the right way to size the prefetch pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0597", "title": "The Adam Memory Multiplier", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does the optimizer alone consume, and what fraction of VRAM 
does it take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0598", "title": "The Decode Bandwidth Demand", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 13B FP16 model at 8k context on an A100, how many bytes are read per decode token and what tokens/sec does that imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0600", "title": "The Stuttering Training Loop", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is a common, non-obvious reason for this low utilization and stuttering behavior?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0601", "title": "The Beam Search Memory Explosion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does Beam Search destroy your concurrency scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0603", "title": "The PCIe Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 14 GB transfer over PCIe Gen4 x16 take, and will the cold start meet the 5-second SLA?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 0}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0607", "title": "The HBM3e Bandwidth Ceiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which workloads actually benefit from the extra bandwidth, and which don't?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0608", "title": "The NUMA Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical topology problem is causing the 40% drop in data loading throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0609", "title": "The Activation Recomputation Trade-off", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the overhead higher than expected?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0611", "title": "The Cache Line Waste", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What fraction of each cache line fetch is wasted, and how does this compare to row-major iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0612", "title": "The Gradient Checkpoint Trade-off", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 32-layer transformer, how much memory does gradient checkpointing every k layers save, and what compute overhead does it add?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 2}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0613", "title": "The Striding Stumble", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the matrix[i][j] inner-loop version 3–5x faster than matrix[j][i] despite doing the same arithmetic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0614", "title": "The Prompt Caching Optimization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Quantify the compute and memory savings of caching the system prompt, and justify if it is worth the engineering effort?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0616", "title": "The AMD MI300X Memory Advantage", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this true, and what's the real systems impact of going from 2 GPUs to 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0617", "title": "The Leaking Inference Server", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is leaking and how do you find it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0618", "title": "The Energy-Movement Invariant", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did pruning 50% of weights halve MACs but barely reduce node energy consumption?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0619", "title": "The Strided Memory Fetch", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does Version B run 30x slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0620", "title": "The HBM vs SRAM Bandwidth Gap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did a seemingly small change in where a tensor is stored yield a 5x speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0621", "title": "The CXL Memory Tier", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can CXL memory avoid tensor parallelism for the 140 GB FP16 70B model, and what performance impact would it have?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 2}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0622", "title": "The NUMA Nightmare", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What technical architectural detail of modern multi-socket systems is most likely causing this, and how would you diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0623", "title": "The Memory Dilemma", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a long-context Transformer accelerator, would you choose HBM or GDDR6, and what trade-offs justify that choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0624", "title": "The GPU Memory Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "We have 80% free memory — where did it go?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0625", "title": "The KV-Cache Fragmentation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where did the missing 19GB of VRAM go?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0626", "title": "The PagedAttention Block Size Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did a large block size destroy your capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0628", "title": "The Embedding Table Sharding Problem", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are the 64 GPUs 80% idle when training a 1 TB sharded embedding table if the network is not saturated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0629", "title": "The Embedding Hotspot", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happened on GPU 14?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0630", "title": "The Bandwidth Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What shared resource are they fighting over?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0631", "title": "CXL Memory for ML Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For which ML workloads does CXL memory make sense, and for which is it a trap?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 3}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0632", "title": "The Paging Paradox", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the underlying cause of this performance bottleneck, and what specific operating system feature would you leverage to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0633", "title": "The Phantom Update", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the most likely underlying hardware-level issue causing the degradation, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0634", "title": "The Fragmentation Crisis", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is consuming our VRAM invisibly, and how do we fix it?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 4}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0635", "title": "The Gradient Checkpointing Boundary", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "With ZeRO-3 on 8x80GB GPUs, where does memory break when training a 70B model?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 4}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0636", "title": "The INT8 Throughput Advantage", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is INT8 inference always 2× faster than FP16 because the data is half the size, and when is that wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0637", "title": "The GGUF Quantization Ladder", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For Llama-3 70B on a 24 GB RTX 4090, how do Q4_K_M, Q5_K_M, and Q8_0 trade off size, throughput, quality, and fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0638", "title": "BF16 vs FP16 for Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do people use BF16 instead of FP16 for training if both formats are 16 bits?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 0}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0639", "title": "The Underflow Crisis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What numerical property is failing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0640", "title": "The Half-Baked Speedup", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "both", "question": "What's a likely technical explanation for this discrepancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0641", "title": "The Quantized Serving Accuracy Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Walk me through the real trade-offs — when does the accuracy drop actually matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0642", "title": "The Latency Budget Breach", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you approach optimizing it for this strict latency constraint without a complete re-architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0643", "title": "The Quantization Error Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For INT4 group-wise quantization with group size 128, what is the worst-case error per group and when does it become catastrophic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0644", "title": "FP8 Training Loss Spikes with BF16 Fallbacks", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do FP8 training loss spikes appear, and how do BF16 fallbacks fix them without abandoning FP8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0645", "title": "The CUDA Upgrade Regression", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can the same weights produce different accuracy with a different CUDA version?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0646", "title": "FP8 E4M3 Gradient Underflow in Transformer Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What numerical physics destroyed your training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0647", "title": "The FP8 Training Frontier", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does FP8 work for the 
large model but fail for the small one?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0648", "title": "The Quantization Noise Floor", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the SQNR for INT8 quantization of Gaussian weights with sigma=0.02, and when do additional quantization levels stop helping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0649", "title": "The Model Compression Pipeline", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compress the 70B FP16 model to hit $0.40 per 1M tokens on one GPU while keeping quality loss under 2%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0650", "title": "The FP16 Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is happening at step 50k that wasn't happening at step 1k?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 2}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0651", "title": "The Precision Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Assuming we are compute-bound, what hardware architectural detail did we forget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0652", "title": "The Fine-Tuning Estimate", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you estimate the total cost and GPU-hours required to fine-tune Llama-2-13B on a 1M-example dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0653", "title": "The Inference Batch Size Sweet Spot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use roofline analysis to find the batch-size sweet spot for the 7B LLM on one H100 under the <500ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0654", "title": "The NCCL NVLink Deadlock", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "Why does a 7-GPU topology fail and hang on a DGX machine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0655", "title": "The NVLink PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why will Vendor A's server fail catastrophically at 8-GPU Data Parallel training compared to Vendor B?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0657", "title": "The Tokenizer Overhead Spikes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What CPU-bound process is blocking the GPU from doing its job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0658", "title": "The FP32 Fallback Penalty", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is your FP16 model running at FP32 speeds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0659", "title": "The GQA/MQA Memory Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why didn't an 8x reduction in KV-cache size yield an 8x reduction in latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0660", "title": "The MoE Memory Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 47B MoE with only 2 active experts per token OOM on a 40 GB card when a dense 13B model would easily fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0661", "title": "The PCIe ACS Block", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the CPU getting involved in a direct GPU-to-GPU transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0662", "title": "The Multi-LoRA Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you serve 100 LoRA adapters for a shared Llama-70B base model, including memory, swapping, and 
batching across adapters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0663", "title": "4K Visual Tokens Blow Up VLM TTFT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where are the 7.8 extra seconds going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0664", "title": "The CPU-Bound Generation Loop", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What part of the Python generate loop is physically preventing the GPU from running faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0665", "title": "The Attention Cost Explosion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does 32x more context cost 10x more to serve, and what are our architectural options?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0666", "title": "Cost per Token for 7B Serving on TPU v5e vs H100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why isn't choosing TPU v5e over H100 for serving a 7B model just a $/chip-hour comparison?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0667", "title": "Unstructured Sparse CUDA Kernel Slower Than Dense Matmul", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why can a 90% sparse CUDA kernel run slower than dense Tensor Core matmul?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0669", "title": "The Disaggregated Serving Architecture", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do we structurally isolate compute-bound long prompts from memory-bound token generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0670", "title": "The Decoding Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can we generate tokens faster without changing the model weights, quantizing, or losing exact 
mathematical accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0671", "title": "The Multi-Modal Token Starvation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is the manager focusing on the wrong part of the stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0672", "title": "Expert Weight Bandwidth Bottleneck in MoE Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What part of MoE serving is destroying memory bandwidth?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 3}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0673", "title": "The PCIe Switch Starvation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Where is the specific hardware bottleneck starving the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0674", "title": "The Prefill-Decode Disaggregation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why would dedicating GPUs to different phases actually improve both throughput and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0675", "title": "The Compilation Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What happened inside the framework to cause this latency drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0676", "title": "The Inference Compiler Optimization", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the real hardware-level reason fusion is so effective?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 2}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0677", "title": "The Gaudi 3 Compiler Bet", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "What are the real systems trade-offs between hand-written CUDA kernels and Gaudi's graph compiler?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0678", "title": "The Automated Model Optimization Pipeline", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an automated model optimization pipeline that takes <1 hour and keeps quality regression under 3% on 95% of models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0679", "title": "The Stalled Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What's the likely culprit for the data_loader bottleneck, and how would you diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0680", "title": "The Gradient Accumulation Equivalence", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "In what two cases does gradient accumulation with batch 64x16 stop being mathematically identical to true batch 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0681", "title": "Epoch Time Under a Data Pipeline Bottleneck", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long will one epoch take, and how does the data pipeline bottleneck impact cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0682", "title": "The Distributed Training Data Bottleneck", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the most likely bottleneck, and how would you systematically diagnose and resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0683", "title": "The ZeRO-1 Memory Squeeze", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why didn't ZeRO-1 save you enough memory?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 2}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0684", "title": "What is DDP?", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": 
"L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does DDP stand for and what is its primary function?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Data Driven Processing; it automatically cleans the dataset.", "Distributed Data Parallel; it copies the model to all GPUs and splits the data batch.", "Dynamic Device Partitioning; it splits the layers of a single model across GPUs.", "Distributed Data Parallel; it shards the model weights across GPUs to save memory."], "correct_index": 1}}, {"id": "cloud-0685", "title": "The All-Reduce Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If one GPU is slightly slower (a straggler), what happens to the rest of the cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The fast GPUs proceed and calculate asynchronous gradients.", "The PyTorch dispatcher automatically re-assigns the batch to a faster node.", "The entire cluster stalls at the synchronization barrier, wasting compute time.", "The cluster drops the straggler's gradients to maintain high throughput."], "correct_index": 2}}, {"id": "cloud-0687", "title": "The Optimizer Explosion", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does Adam mixed-precision training of a 30B model OOM, and which sharding strategy fixes it?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 0}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0688", "title": "The Communication Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why has scaling efficiency collapsed on this 512-GPU 10 Gbps Ethernet cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0690", "title": "The Memory Copy Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Based on the architecture and symptoms, what is the 'silent killer' limiting your throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0691", "title": "Long-Prompt Prefill Causing Decode Stutter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do long-prompt prefill requests make active decode streams freeze?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0692", "title": "The Memory Swiss Cheese", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is VRAM exhausted after 8 users despite each user generating only a few dozen tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0693", "title": "The Checkpoint Traffic Jam", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the storage topology, what two physical bottlenecks are you hitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0695", "title": "The KV-Cache Network Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Based on the diagram, what physical link is destroying your latency gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0698", "title": "The Pipeline Bubble Tax", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "With P=4, M=16, and T_stage=50 ms, how do you calculate the total global-batch time, and what is it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["800 ms", "3200 ms", "1000 ms", "950 ms"], "correct_index": 3}}, {"id": "cloud-0699", "title": "The AllReduce Bottleneck", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which communication primitive is the most common scaling bottleneck when moving DDP from 8 to 128 GPUs?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 0}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data loading (ETL) pipeline", "The optimizer step (e.g., AdamW)", "The AllReduce operation", "The forward pass computation"], "correct_index": 2}}, {"id": "cloud-0700", "title": "The FSDP Memory Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Using FSDP/ZeRO-3 on 16 H100s, how much memory per GPU is needed for parameters, gradients, and Adam states for a 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1120 GB", "8.75 GB", "70 GB", "140 GB"], "correct_index": 2}}, {"id": "cloud-0701", "title": "The Training Cost Estimate", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the estimated H100 training cost for a 70B model on 2T tokens at $3.50/GPU-hour, and what is the biggest budget risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0702", "title": "The Phantom Performance Drop", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you quickly pinpoint if the input data for the production model is different from the training data, and what's the most common culprit?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 1}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0704", "title": "The Distributed Training Choke Point", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What primary bottleneck causes the 50% per-GPU throughput drop when scaling the 70B job from one 8x GPU node to two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 140 GB gradient transfer over PCIe Gen5 adds 4.4 seconds of latency per step, saturating the bus.", "Cross-node gradient AllReduce must exchange large gradient payloads over the 50 GB/s InfiniBand link, making the off-node collective the bottleneck.", "The TCP/IP encapsulation overhead adds 2.5s of protocol serialization delay, halving throughput.", "Model parameters expand to 280 GB in distributed FP32, exceeding single-node NVLink limits."], "correct_index": 1}}, {"id": "cloud-0705", "title": "The Tensor Parallelism Scaling Trap: Collective Communication", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 1.2× scaling expected or anomalous, and what is the most likely cause of the poor scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data transfer from host CPU memory to GPU memory over PCIe is the bottleneck.", "The RDMA protocol is adding excessive latency overhead, which accounts for the slowdown.", "The performance is expected; the bottleneck is the ~18x bandwidth gap between intra-node NVLink (900 GB/s) and inter-node InfiniBand (50 GB/s).", "One of the nodes must have a faulty NVLink switch that is slowing down the entire 16-GPU communication ring."], "correct_index": 2}}, {"id": "cloud-0707", "title": "The AllReduce Scaling Trap", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What communication bottleneck causes poor scaling when the 70B FP16 job moves from one 8×H100 node to two nodes over InfiniBand NDR?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated because gradients must be copied to CPU RAM taking ~4.3s before being sent to the network.", "The NVLink 4.0 interconnect within each node is the bottleneck; it cannot handle the 140GB gradient exchange.", "The inter-node InfiniBand NDR network fabric is the bottleneck, as its bandwidth is much lower than the intra-node NVLink fabric.", "The CPUs on each node are overwhelmed with coordinating the RDMA transfers, starving the GPUs of instructions."], "correct_index": 2}}, {"id": "cloud-0709", "title": "Data-Parallel Scaling Across NVLink and InfiniBand", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What explains the sub-linear scaling when the job expands from one NVLink node to eight InfiniBand-connected nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0710", "title": "The Tensor Parallelism Degree", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which TP degree—2, 4, or 8—is optimal for the 70B LLM on H100 NVLink under a 40 ms per-token latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0711", "title": "The ZeRO-3 Communication Overhead", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the ZeRO-3 throughput 40% slower, and where exactly is the GPU time going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0713", "title": "The Pipeline Bubble Cost", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much GPU-time is wasted in the pipeline bubble, and how many microbatches do you actually need to keep the bubble under 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0714", "title": "The Heterogeneous GPU Training", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What happens when you combine A100 and H100 GPUs in a single DDP training job, and how can you use both efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0715", "title": "The Async SGD Staleness Problem", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is that the full story, and can you quantify the staleness?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0716", "title": "The Pipeline Stutter (1F1B)", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What non-compute operation broke the 1F1B rhythm?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0717", "title": "The Idempotent Training Pipeline", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign this pipeline to be more fault-tolerant and cost-efficient, specifically focusing on making its stages idempotent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0718", "title": "The Straggler Log Rotation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the cron job doing that halts the entire cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0719", "title": "The Expert Parallelism Communication", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "At what point does the network become the bottleneck in this MoE training setup, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0720", "title": "Dimensioning the 3D Cube", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Given those memory, layer-count, and topology constraints, how do you allocate the dimensions for Data (D), Tensor (T), and Pipeline (P) parallelism for a 175B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0721", "title": "The 3D Parallelism Orchestration", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you assign Tensor (T), Pipeline (P), and Data (D) parallelism across 1,024 H100s for a 175B, 96-layer model, and what physical constraint justifies each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0722", "title": "The ZeRO-3 Cross-Node Thrashing", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why did a slight batch size increase cause ZeRO-3 step time to 
spike?", "chain_ids": ["cloud-chain-auto-025-07"], "chain_positions": {"cloud-chain-auto-025-07": 3}, "chain_tiers": {"cloud-chain-auto-025-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0724", "title": "The Heterogeneous Cluster Scheduler", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a heterogeneity-aware scheduler for a mixed-generation cluster to maximize utilization and cost efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0726", "title": "The Collective Communication Primitives", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For each collective, name a training strategy that uses it and estimate the per-GPU communication volume for a 1 GB tensor on 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0727", "title": "The All-Reduce Stalemate", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can communication dominate for both a 1M and a 100B model on 8-node data-parallel training, but for different reasons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0728", "title": "The AllReduce Tax", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How long does the AllReduce take, and what fraction of the training step is spent on communication?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 2}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0729", "title": "The Cross-Rack Stall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What did we misunderstand about network topology and Tensor Parallelism?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 2}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0730", "title": "512-GPU FSDP Networking: InfiniBand NDR vs RoCE v2", "topic": "compound-ai-systems", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which networking option do you choose, and when does the cheaper option break down?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0731", "title": "The Topology Trap", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which common network topology might be causing this, and which would you prefer for pipeline parallelism, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0732", "title": "NCCL's Uneven Footing", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and mitigate this inconsistent Data Parallelism performance, focusing on NCCL's behavior?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0733", "title": "The AllReduce Incast Congestion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical network phenomenon is causing this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0734", "title": "The NCCL Topology Misconfiguration", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is NCCL ignoring your expensive InfiniBand network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0735", "title": "The ToR Switch Buffer Microburst", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the network freezing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0736", "title": "The Ring AllReduce Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ring AllReduce degrade massively at scale despite constant per-GPU bandwidth, and what replaces it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0737", "title": "The NVLink Domain Boundary", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What physical boundary did you cross?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 3}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0738", 
"title": "The Oversubscription Choke", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0739", "title": "The Congestion Collapse", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What network physical phenomenon occurred?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0740", "title": "The Gradient Synchronization Overlap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Under what conditions is this true, and when does the overlap break down?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 3}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0741", "title": "The Network Congestion Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is happening to cause this throughput drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0742", "title": "The Congested Highway", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "You suspect network congestion, but how do you verify this and implement a system-level solution to ensure more predictable network performance for critical ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0743", "title": "Fat-Tree Inter-Pod Bisection Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural constraint in the Fat-Tree topology is causing the network efficiency to plummet for cross-pod jobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0744", "title": "The Gradient Compression Paradox", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why won't they get anywhere near 100× improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-0745", "title": "The InfiniBand Link Flap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a single flapping InfiniBand link stall all 256 GPUs in the cluster?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0746", "title": "The Bisection Bandwidth Requirement", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What minimum bisection bandwidth is needed to avoid communication bottlenecks, and is a fat-tree topology sufficient or optimal for this 1,024-GPU 3D-parallel workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0747", "title": "The Ring vs Tree Dilemma", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does the 'best' algorithm fail here?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0748", "title": "The Cross-Datacenter Training", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Is synchronous training feasible across this WAN link, and if not, what is your alternative architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0749", "title": "The Network Topology Tax", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which topology fits these workload patterns, and when is the fat-tree worth the premium?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0750", "title": "The Spot Instance Checkpoint Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How often should you checkpoint to minimize wasted time, and what's the expected cost savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0751", "title": "The Straggler Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the total step time for the cluster, and what percentage of cluster compute is wasted due to this single node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0752", "title": "Handling Spot Preemptions with Elastic Distributed Training", 
"topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you handle GPUs appearing and disappearing mid-training without restarting from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0753", "title": "The Checkpoint Storage Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What's the checkpoint size, how long does the write take, and what's the impact on training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0754", "title": "The Checkpoint Resurrection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the PM's 30-minute recovery estimate dangerously optimistic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0755", "title": "The Straggler Mitigation Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What fraction of steps will have at least one straggler, and how do you mitigate this without switching to async SGD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0756", "title": "The Failure Recovery Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the expected failures per day and effective utilization, and what architecture gets the 10,000-GPU job above 90% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0757", "title": "Checkpointing a 1T-Parameter Training Job", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What fault-tolerance strategy would you design to minimize downtime for this 1T-parameter training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0758", "title": "The Checkpoint Serialization Freeze", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why won't faster S3 fix the 3-minute stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-0759", "title": "The NFS Checkpoint Corruption", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What happened to the step-79,000 checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0760", "title": "The Silent ECC Degradation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What hardware component is silently degrading, and when does this 'slightly different' behavior become dangerously wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0761", "title": "The Optimal Checkpoint Interval", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted time (checkpoint overhead plus expected lost work from failures)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0762", "title": "The MTBF Crisis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Is saving a checkpoint for 5 minutes every hour viable at 10k GPU scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0763", "title": "The Fault-Tolerant Training Framework", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What fault-tolerant training framework would achieve >95% effective utilization with 2,048 GPUs and a cluster MTBF under 30 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0764", "title": "The Silent Data Corruption at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What could cause a model to silently underperform, and how would you detect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0765", "title": "The Cosmic Ray Divergence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How can a cosmic ray bit flip corrupt a model without crashing it, and how do you find which of the 70 billion parameters is wrong?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0766", "title": "The Split-Brain Checkpoint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you recover without losing more than 1,000 steps of work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0767", "title": "The Warmup Learning Rate Schedule", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the systems-level reason warmup is physically necessary for large-batch training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0768", "title": "The DDP Bucket Straggler", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why isn't the DDP communication overlap hiding the 200ms delay of throttled GPU 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0769", "title": "The Bad Batch Spike", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What causes the loss spike, and how do you prove it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0771", "title": "The Data Parallel Straggler", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "By exactly how much does this single degraded node slow down the entire 256-GPU training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0772", "title": "The NCCL Timeout", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the random NCCL timeout hangs across different ranks, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0773", "title": "The Gradient Overflow", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What's happening on those 3 GPUs, and why does it infect the entire cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0775", "title": "The Reproducibility Paradox", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where are the hidden sources of non-determinism, and what does it cost to eliminate them on a modern GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0776", "title": "The Global Model", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you train a single foundation model across US, EU, and APAC without moving raw regional data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0777", "title": "Multi-modal Candidate Generation at Billion-Scale", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the embedding architecture, index, and serving infrastructure to support multi-modal retrieval at this scale for Instagram Reels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0778", "title": "Real-Time Click Prediction with Continual Learning", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the streaming ingestion and distributed training pipeline to update CTR weights within 5 minutes while correcting for delayed clicks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0779", "title": "3D Parallelism for MFU and DDP Communication", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect the distributed training strategy (3D parallelism) and specifically optimize the DDP communication overhead to maximize Model Flops Utilization (MFU)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0780", "title": "Global Scale Real-Time Two-Tower Recommendation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you decouple item embeddings from the TPU serving path while meeting sub-50ms p99 latency for a 50B-candidate two-tower recommender?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0781", "title": "Multi-Turn Gemini LLM Serving with PagedAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design TPU v5p serving for 1M-token chats to reduce KV-cache fragmentation while meeting a 2-second TTFT SLA?", 
"chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 3}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0782", "title": "Trillion-Parameter MoE Training on TPU Torus Topology", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you map 3D plus expert parallelism for a 2T MoE on 10,000+ TPU v5p chips to avoid OCS bottlenecks and reach 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0783", "title": "The GPU Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the energy cost for this training run, and how does it compare to a cloud compute cost of $3.50/GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0784", "title": "The Container Bloat", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the primary culprits for such a large image, and how would you systematically reduce it to fix cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0785", "title": "The Unresponsive Replica", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What's likely going wrong with our health check strategy and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0788", "title": "The Thermal Memory Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What thermal component are you failing to monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0789", "title": "The Attention Skew", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an evenly chunked sequence cause an asymmetric memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0790", "title": "The Compilation Wall", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is killing your performance and causing the 
48-hour start time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0791", "title": "The Root Complex Choke", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the PCIe topology, why didn't adding more NVMe drives fix the IO bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0792", "title": "The Memory Illusion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the throughput reality of this unified-memory design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0793", "title": "Spot Preemption Deletes Local NVMe Dataset", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the 1 TB dataset missing after the spot instance is preempted and training resumes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0794", "title": "The Silent Regression", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you have prevented this silent regression and what deployment strategy would you advocate for future model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0795", "title": "The Silent Failure", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the high GPU utilization mask the failure, and how can the hardware be healthy while the ML output is perfectly wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0796", "title": "The Training-Serving Skew", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do the different hardware paths cause this numerical divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0797", "title": "The Model Deprecation Cliff", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does maintaining multiple model generations simultaneously fragment your serving cluster and destroy GPU utilization?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0798", "title": "The LLM Evaluation Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might this evaluation be dangerously misleading, and how could deploying the better model destroy serving economics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0799", "title": "Optimizing Retraining Cadence from Drift and GPU Cost", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design an automated retraining policy that uses the cost optimum as a trigger, while accounting for validation gates, GPU quota, and rollback risk?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 4}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0800", "title": "The Stale Feature Store", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural flaw causes online fraud model degradation after offline feature pushes, and how would you keep features fresh and consistent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0801", "title": "Fuzzy Deduplication Economics for 10TB Pretraining Data", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Who is right, and what is the economically optimal deduplication strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0802", "title": "The Silent Schema Shift", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you prevent silent schema shifts in a petabyte-scale ML pipeline from degrading downstream models?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 3}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0803", "title": "The PII-Sensitive Training Dilemma", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which privacy-preserving training approaches would you use for regulated PII data, and how do their utility, complexity, and cost trade off?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0804", "title": "The Exploding Data Lake Bill", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you cut storage costs for a 500 PB S3 data lake while preserving availability for ML workloads and compliance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0806", "title": "The PUE Dollar Cost", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the $5M liquid-cooling retrofit worth it for reducing PUE from 1.4 to 1.1, and what is the payback period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0808", "title": "The Thermal Throttling Mystery", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the identical Phoenix cluster training 30% slower than the Oregon cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0809", "title": "The Spot Instance Gamble", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the true expected cost of spot training considering preemption rates, and when does on-demand actually become cheaper?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 0}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0810", "title": "The Energy Bill", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What were the energy consumption and carbon footprint of the 30-day 256× H100 training run in a US data center?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0811", "title": "The Energy Economics", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why is the $100M figure severely underestimating the budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0812", "title": "The Carbon-Aware Scheduler", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a carbon-aware scheduler for the 70B run, and do the numbers prove 
carbon offsets are cheaper than moving workloads?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 3}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0813", "title": "The Carbon-Neutral Training Scheduler", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 500 weekly training jobs across Virginia, Oregon, and Ireland to minimize carbon within 15% of baseline cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0814", "title": "The Floating Point 32 Checkpoint Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is taking up the extra 280 GB, and can you delete it before deploying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0815", "title": "The Model Distillation Economics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should we do the 2-week 32x A100 distillation to replace the $180k/month 70B service with a 7B A10G student?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0816", "title": "The S3 Data Wall", "topic": "data-efficiency-selection", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did streaming from object storage starve your compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0817", "title": "The Parquet Row Group Chunking", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why didn't the columnar format save you bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0818", "title": "The Data Gravity Well", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you engineer the training pipeline to connect the data to the GPUs, and what is the hidden economic catastrophe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0819", "title": "The Transformer Weight Footprint", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much VRAM do the weights 
alone consume, and does the model fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 7 GB (INT8 Trap)", "B) 14 GB", "C) 28 GB (FP32 Trap)", "D) 56 GB (AdamW State Trap)"], "correct_index": 1}}, {"id": "cloud-0820", "title": "The Tokenizer Throughput Ceiling", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Given the measured H100 serving rate, roughly how many tokens per second must the CPU tokenizer produce to avoid becoming the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~700 tokens/sec", "B) ~7,000 tokens/sec", "C) ~70,000 tokens/sec", "D) ~700,000 tokens/sec"], "correct_index": 2}}, {"id": "cloud-0821", "title": "The PCIe Transfer Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How long is the one-way host-to-device transfer for one batch over PCIe Gen5 x16 at 64 GB/s?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 0}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~4.8 ms", "B) ~19.2 ms", "C) ~154 ms", "D) ~76 ms"], "correct_index": 1}}, {"id": "cloud-0822", "title": "The Kernel Fusion Memory Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much HBM traffic does fusion eliminate for this tensor?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 0}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~10% of HBM traffic", "B) ~25% of HBM traffic", "C) ~50% of HBM traffic", "D) ~90% of HBM traffic"], "correct_index": 2}}, {"id": "cloud-0823", "title": "The Gradient AllReduce Time", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How long does the AllReduce synchronization take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~11 ms", "B) ~54 ms", "C) ~109 ms", "D) ~1,090 ms"], "correct_index": 2}}, {"id": "cloud-0824", "title": "The ECC Bit Error Reality", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Assuming a raw bit error rate of 1e-15 per bit per hour (before ECC correction), roughly how many uncorrected bit errors would you expect per hour without ECC?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 0}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~0 errors (bits never flip)", "B) ~0.00064 errors/hour", "C) ~6.4 errors/hour", "D) ~640 errors/hour"], "correct_index": 1}}, {"id": "cloud-0825", "title": "The Checkpoint Size Math", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the total checkpoint size including weights (FP16), Adam optimizer states (FP32 momentum + variance), and gradients (FP32)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 26 GB", "B) 78 GB", "C) 182 GB", "D) 364 GB"], "correct_index": 2}}, {"id": "cloud-0826", "title": "The MoE Sparse Activation Ratio", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the effective compute cost per token of this top-2 MoE compared with a dense 47B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100% (94 GFLOPs) - Assuming all parameters are active for every token.", "~36% (34 GFLOPs) - Only backbone + top-2 experts activate.", "~21% (20 GFLOPs) - Forgetting to include the 7B shared backbone.", "~25% (24 GFLOPs) - Assuming top-2 routing without multiplying the 5B expert size by 2."], "correct_index": 1}}, {"id": "cloud-0827", "title": "The Cross-Node AllReduce Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the estimated ring AllReduce time, assuming cross-node InfiniBand is the bottleneck?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 1}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~50 ms", "B) ~200 ms", "C) ~2,000 ms", "D) ~20,000 ms"], "correct_index": 2}}, {"id": "cloud-0828", "title": "The NVLink vs PCIe Tensor Parallel Gap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the per-layer communication time over NVLink 4.0 (450 GB/s) versus PCIe Gen5 x16 (63 GB/s) for batch=32, seq=2048, hidden=4096 in FP16, and is PCIe tensor parallelism viable?", "chain_ids": ["cloud-chain-auto-002-03"], "chain_positions": {"cloud-chain-auto-002-03": 0}, "chain_tiers": {"cloud-chain-auto-002-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) NVLink is ~2x faster — minor difference", "B) NVLink is ~5x faster — noticeable but manageable", "C) NVLink is ~7x faster — PCIe makes tensor parallelism impractical", "D) NVLink is ~100x faster — different technology class"], "correct_index": 2}}, {"id": "cloud-0829", "title": "The Flash Attention Memory Cliff", 
"topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much HBM does standard attention need to materialize the attention-probability matrix, versus the extra score-tile workspace FlashAttention-2 needs when using 128-token tiles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0830", "title": "The Cluster MTBF Math", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How often should you expect a GPU failure to interrupt training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Once per month — 10,000-hour MTBF is very reliable", "B) Once per week — failures are uncommon", "C) Once every ~10 hours — cluster reliability compounds", "D) Once per hour — GPUs are inherently unreliable"], "correct_index": 2}}, {"id": "cloud-0831", "title": "The Async Checkpoint Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "A synchronous checkpoint would stall training for how long, and how does async checkpointing fix this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~2 seconds — NVMe is fast enough", "B) ~23 seconds — manageable with less frequent checkpoints", "C) ~233 seconds — catastrophic without async checkpointing", "D) ~2,333 seconds — checkpointing is practically impossible"], "correct_index": 2}}, {"id": "cloud-0832", "title": "The Datacenter PUE Cooling Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the total facility power, annual electricity cost, and non-GPU overhead for the 10,000-H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0833", "title": "The TCO Per Token Analysis", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what sustained daily token demand does building on-prem inference capacity become more cost-effective than a $0.50 per million token cloud API?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 3}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0834", "title": "The Spot Instance Training Economics", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Are spot instances worth the operational complexity 
for this 1,000 GPU-hour fine-tuning job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0835", "title": "The Carbon-Aware Scheduling Tradeoff", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is carbon-aware scheduling to the renewable region worth the 20% capacity and 15% data-latency tradeoff for a 1,000 GPU-hour training job?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 2}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0837", "title": "A/B Testing Survivorship Bias via Latency Timeouts", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root cause in the system logs and A/B telemetry explains Model B's higher reported CVR but 12% lower revenue?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0838", "title": "Recommender Migration Latency Degradation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should they analyze the profile traces to root-cause this anomaly across the hardware spectrum?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0839", "title": "Evaluating A/B Test Trade-offs for Heavy Models", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 20% two-week A/B test proceed as requested, or is an alternative architecture needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0841", "title": "High-Resolution ViT OOM Analysis", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck, and what compute-based trade-off would fit the 1024x1024 ViT in 80GB without CPU offloading?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0842", "title": "Evaluating Activation Checkpointing Trade-offs for LLMs", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use full or selective activation checkpointing to reduce the 105GB per-GPU footprint, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0843", "title": "Activation Sparsity Memory 
Compression", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact compressed activation size in MB after storing a 1-bit bitmask and the non-zero FP32 values?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0844", "title": "Diagnosing Latency Spikes after GELU Replacement", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did replacing ReLU with GELU spike token latency from 75ms to 140ms and increase server power by 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0845", "title": "Calculate Activation Checkpointing Memory Savings", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the peak activation memory with layer-boundary checkpointing, and will this training step fit on an 80GB A100?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 0}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0846", "title": "Diagnosing AOC Link Flaps and Tail Latency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this performance degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0847", "title": "Cascading Failures in Active-Active Inference APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did losing one AZ take down the whole API, and what capacity-planning flaw caused the cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0848", "title": "Sizing Multi-Region Active-Active LLM Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total nodes are needed across two regions for zero-downtime failover, and what is normal peak utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0849", "title": "Global Active-Active Evaluation for Fraud Detection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the cost and latency trade-offs of using global 
Active-Active versus local Active-Passive to survive one region failure under a 100ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0850", "title": "AOC Power and Latency Overhead Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total continuous power overhead of the AOC fabric and the one-way latency penalty for a cross-rack GPU-to-GPU message compared to a local DAC-only route?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0851", "title": "Adversarial Training Compute Overhead", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much additional compute time per epoch does FGSM adversarial training add for the 5-million-image ResNet-50 dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0852", "title": "MoE All-to-All Network Load Imbalance", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause, and how do you resolve the bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0853", "title": "Routing MoE All-to-All Bursts at Scale", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is ECMP on a non-blocking Fat-Tree sufficient for 32,000-GPU MoE All-to-All bursts, or should you use Adaptive Routing?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0854", "title": "Evaluating Adversarial Debiasing Dynamics in Credit Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did the GRL adversarial debiasing setup hurt throughput and AUC, and how would you redesign it?", "chain_ids": ["cloud-chain-auto-secondary-014-30"], "chain_positions": {"cloud-chain-auto-secondary-014-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0855", "title": "Diagnosing Adversarial Debiasing Instability in NLP", "topic": "fairness-evaluation", 
"competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the GRL debiasing training to plateau main loss at 0.85 while the adversary falls to random accuracy?", "chain_ids": ["cloud-chain-auto-secondary-014-30"], "chain_positions": {"cloud-chain-auto-secondary-014-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0856", "title": "Diagnosing Moderation Evasion Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this failure and what is the root cause?", "chain_ids": ["cloud-chain-auto-secondary-015-24"], "chain_positions": {"cloud-chain-auto-secondary-015-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0857", "title": "Sizing RoCEv2 Buffers for Adaptive Routing", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum path latency skew the 16MB reassembly buffer can tolerate at 800 Gbps line rate?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0858", "title": "E-commerce Recommendation Alignment Gap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you modify the ranking objective and architecture to stop optimizing cheap clicks and recover revenue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0860", "title": "Evaluating Defenses for High-Throughput Content Moderation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you deploy under a 50ms SLA and a tight training budget?", "chain_ids": ["cloud-chain-auto-secondary-015-24"], "chain_positions": {"cloud-chain-auto-secondary-015-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0861", "title": "CTR Versus Watch-Time Alignment Failure", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the retention and watch-time drop, and how should the ranking model be realigned?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0862", "title": "Diagnosing FSDP AllGather Topology Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and root-cause this collective communication stall?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 2}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0863", "title": "Evaluating Proxy Misalignment in E-Commerce Ranking", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you realign the recommender with GMV while keeping P99 serving latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0864", "title": "Ring-AllReduce Transfer Time Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum Ring-AllReduce time to synchronize the 10B FP16 gradients across the 8 GPUs?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 1}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0865", "title": "Ring AllGather Time in FSDP", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum time for the ring AllGather to reconstruct the 2.4GB block across 8 GPUs?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 1}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0866", "title": "Flat vs Hierarchical AllGather", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use flat or hierarchical AllGather for FSDP weight reconstruction across 512 GPUs, and what latency difference drives that choice?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 3}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0867", "title": "MoE AllToAll Communication Time", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", 
"status": "published", "phase": "both", "question": "What is the minimum theoretical time for the NVSwitch AllToAll when each A100 sends 1.5GB to each of 7 peers?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 0}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0868", "title": "Evaluating Flat vs. Hierarchical AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which AllReduce topology minimizes latency for a 2GB gradient payload on 1024 A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0869", "title": "Diagnosing MoE AllToAll Network Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck during the AllToAll phase, and how do you mitigate it?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 1}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0870", "title": "Diagnosing Inter-Node AllReduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the actual bottleneck causing the 320ms AllReduce latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0871", "title": "Hardware Upgrade Speedup Estimation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the new training step time and overall speedup after moving to H100s with 4x GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0872", "title": "Evaluating H100 Upgrades for Hybrid Recommendation Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the 8x H100 upgrade justified given the 3x GPU speedup, unchanged CPU/network time, and 4x hourly cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0873", "title": "API Cost and Rate Limits for Model Theft", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the attacker's total cost and sustained RPS over 7 days, and why do burst 
rate limits fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0874", "title": "Diagnosing H100 Pipeline Bottlenecks", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the hardware migration deliver only 1.4x end-to-end speedup instead of the 3.2x FP16 TFLOPS gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0875", "title": "Diagnosing Systematic Probing for LLM Extraction", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 100-query/min user conducting a viable model extraction attack, and how much information leaks via full logprobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0876", "title": "Diagnosing GPU Starvation in Vision Pipelines", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is bottlenecking the ViT-H training loop, and how would you size the prefetching pipeline to eliminate GPU bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0877", "title": "Sizing Dataloader Workers and Prefetch Batches", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many CPU dataloader workers and prefetched batches are needed to match the 80ms GPU step and hide 500ms disk spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0878", "title": "Evaluating API Defenses Against Model Extraction", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you choose against the 2M-query extraction attack, and how do efficacy, revenue, and utility trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0879", "title": "Evaluating Async Prefetching for H100 Starvation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs between CPU workers, offline preprocessing, and async GPU decoding (DALI) to stop H100 starvation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0880", "title": "Dynamic Autograd Tape vs Static Compilation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": 
"cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you wrap the dynamic GNN training step in torch.compile/XLA, or keep eager autograd, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0881", "title": "Autograd Tape Memory Footprint in Dynamic RNNs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much autograd-tape memory do the 15,000 saved FP32 RNN tensors consume, and does it exceed the 10GB activation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0882", "title": "Diagnosing Autograd Tape Memory Leaks in Gradient Accumulation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does gradient accumulation OOM at micro-batch 14, and how should the training loop use autograd to avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0883", "title": "Autograd Activation Memory vs Recomputation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does storing H consume per layer, and how much would recomputing H in a custom backward save?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 0}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0884", "title": "Forward vs. 
Reverse Mode AD", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch the Jacobian computation to forward-mode AD, and what latency should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0885", "title": "KV Cache Memory Bandwidth Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing memory bandwidth saturation during 70B autoregressive decoding, and which component should you redesign?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0886", "title": "PagedAttention vs GQA for KV Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you prioritize PagedAttention or an 8x-KV-head GQA model to maximize 4096-token decoding throughput, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0887", "title": "KV Cache Capacity and Bandwidth Limits", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the KV cache size per token, maximum 2048-token batch size, and total memory read per decoding step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0888", "title": "Estimating Backdoor Poisoning Rate", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 5x5-triggered poisoned images must the attackers inject to appear in at least 50% of 8192-image batches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0889", "title": "Debugging Autograd Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the backward pass OOM with T=1000 activations, and how can you train without reducing sequence length or batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0890", "title": "Evaluating Backdoor Mitigations in Cloud Code Gen", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use activation-clustering sanitization or inference-time prompt perturbation to neutralize the docstring backdoor, and why?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0891", "title": "Diagnosing Targeted Triggers in Cloud Vision APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose whether this is a backdoor data poisoning attack versus a natural adversarial example or a dataset bias, and what specific metrics do you use to isolate the trigger?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0892", "title": "Multi-AZ Link Buffer Exhaustion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What in-flight data is needed to saturate one 400Gbps cross-AZ link over 50km, and can 64MB-buffer ToR switches support it?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 0}, "chain_tiers": {"cloud-chain-auto-027-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0893", "title": "Geo-Distributed Training Throughput Collapse", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 400Gbps inter-AZ link capped near 32Gbps, and what buffer or communication redesign is required?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 1}, "chain_tiers": {"cloud-chain-auto-027-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0894", "title": "Evaluating Compute Upgrades for LLM Decoding", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will doubling TFLOPs while keeping 2.0TB/s bandwidth halve batch-1 token latency, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0895", "title": "LLM Decoding Throughput on A100", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What speedup should you expect from doubling TFLOPS but keeping 2.0 TB/s bandwidth for batch-1 7B decoding, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0896", "title": "Diagnosing Low SM Utilization in LLM Decoding", "topic": "extreme-quantization", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is upgrading the batch-1 7B LLM service to an H100 a good way to cut latency to 2ms, and what should you do instead?", "chain_ids": ["cloud-chain-auto-secondary-011-24"], 
"chain_positions": {"cloud-chain-auto-secondary-011-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0897", "title": "Bandwidth Taper in DLRM Embeddings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum time to fetch 1.6GB of embeddings over PCIe Gen4 x16, and how does it compare with HBM2e-resident data?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 1}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0898", "title": "Diagnosing Multi-GPU Pipeline Stalls", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bandwidth bottleneck is starving the GPUs despite 600 GB/s internal bandwidth, and how do you quantify it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0899", "title": "Evaluating Parallelism Mapping Across the Bandwidth Taper", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should tensor parallelism span multiple H100 nodes here, or how should TP, PP, and DP be mapped across the cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0900", "title": "Evaluating WAN Links for Multi-Datacenter LLM Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the two datacenters be pooled for synchronous 175B training, and how should TP, DP, and PP be mapped?", "chain_ids": ["cloud-chain-auto-027-05"], "chain_positions": {"cloud-chain-auto-027-05": 2}, "chain_tiers": {"cloud-chain-auto-027-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0901", "title": "Parquet Batch Ingestion Memory Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum batch time window avoids OOM in the 8 GiB container, and what size Parquet file does that batch produce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0902", "title": "Bare-Metal MCU Memory Allocation Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the peak SRAM utilization during Cortex-M4 
inference, and does the 150KB-weight INT8 CNN fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0903", "title": "Evaluating Bare-Metal SmartNIC Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 5-microsecond p99 packet SLA, should inference run on the Linux Xeon host or bare-metal SmartNIC Cortex-M7, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0904", "title": "Debugging HardFaults in Bare-Metal Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the root cause of this crash without OS-level memory profiling tools?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0905", "title": "Diagnosing Small Batch Instability in 3D CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and resolve this architectural bottleneck causing erratic inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0906", "title": "Batch vs Streaming Ingestion for CTR Models", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the Ads CTR pipeline use continuous Kafka streaming or a scheduled 15-minute 1GB Parquet batch ingestion system, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0907", "title": "SyncBatchNorm Communication Overhead", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much data does one GPU transmit during the SyncBN forward-pass Ring AllReduce for 1024 FP32 channels on 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0908", "title": "Normalization for Micro-Batch 3D Segmentation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes the 3D U-Net instability at batch size 1 per GPU, and how should the normalization layers be redesigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0909", "title": "LLM Batch Size Memory Limit", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", 
"status": "published", "phase": "both", "question": "What peak VRAM is required for batch size 32, and what maximum feasible batch size should be configured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0910", "title": "Diagnosing S3 Batch Ingestion Bottleneck", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this ingestion bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0911", "title": "Analyzing Activation Memory OOM at Scale", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 OOM at batch size 512 despite under 1GB of weights and optimizer state, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0912", "title": "Transformer Batch Size and Memory Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What limits batch-size scaling on the 24GB GPU, what is the maximum viable batch size, and what safe cap would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0913", "title": "Optimizing Dynamic Batching Windows", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which batching window (2ms, 12ms, or 30ms) guarantees stability under the 50ms SLO?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0914", "title": "Quantifying LLM Benchmark Contamination", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the model's true accuracy on the 400 clean benchmark questions, and how much absolute accuracy inflation came from contamination?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 0}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0915", "title": "Diagnosing Sudden MMLU Score Spikes in LLM Pre-training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this anomaly and what mitigation should you apply?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 1}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0916", "title": "Calculating Maximum Batching Window", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum Triton max_queue_delay can safely meet the 100ms SLA at max batch size 16?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 1}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0917", "title": "Low Traffic Latency Spikes", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does p99 latency worsen at 5 QPS, and what maximum batching_window guarantees the 100ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0918", "title": "Evaluating Contamination in Code LLMs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can the model score 82% on HumanEval yet drop production acceptance to 22%, and how should contamination be prevented?", "chain_ids": ["cloud-chain-auto-003-17"], "chain_positions": {"cloud-chain-auto-003-17": 2}, "chain_tiers": {"cloud-chain-auto-003-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0919", "title": "Multi-Dimensional Resource Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 8-GPU training jobs pending for Insufficient CPU at only 60% utilization, and how is the scheduler stranding GPUs?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 1}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0920", "title": "Evaluating Multi-Dimensional Bin Packing for Stranded Capacity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What multi-dimensional scheduling heuristic would reduce stranded capacity, and how does it improve packing over GPU-only Best-Fit?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 2}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0921", "title": "Calculate Memory and Compute Savings for a BNN", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", 
"status": "published", "phase": "both", "question": "What is the BNN weight footprint with 5% of FP32 parameters kept at INT8, and how many 64-bit XNOR-popcount instructions are needed?", "chain_ids": ["cloud-chain-auto-secondary-011-23"], "chain_positions": {"cloud-chain-auto-secondary-011-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0922", "title": "Evaluating BNNs for Cloud-Scale Filtering", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 1M QPS traffic filter move from INT8 GPUs to a BNN on cloud FPGAs, and what are the compute and memory trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-011-23"], "chain_positions": {"cloud-chain-auto-secondary-011-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0923", "title": "Calculate Cluster Bisection Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the bisection bandwidth of the 1,536-GPU leaf-spine fabric, and what per-GPU bandwidth limits global Ring AllReduce?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 0}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0924", "title": "Diagnosing 4:1 Network Oversubscription in LLM Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What physical network bottleneck explains the 4x slower global AllReduce despite perfect 600GB/s intra-node NVLink utilization?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 1}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0925", "title": "BNN Throughput Regression on Cloud GPUs", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the BNN on NVIDIA T4 1.5x slower than the INT8 version despite the expected 32x memory saving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0926", "title": "Evaluating Topologies for MoE All-to-All", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 4096 GPUs running MoE All-to-All, should you choose a non-blocking Fat-Tree or a 3D Torus, and what 
bisection bandwidth drives the choice?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 2}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0928", "title": "Multi-Dimensional Bin Packing and Stranded GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using GPU-sorted First-Fit Decreasing, how many nodes are required for the 5 fine-tuning and 10 serving jobs, and how many GPUs are stranded?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 0}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0929", "title": "Decoupling Shared Embeddings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the shared 30GB embedding model be decoupled or versioned to stop Team A's update degrading Team B under the 15ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0930", "title": "Revenue Impact of Implicit Data Dependency Erosion", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much revenue is lost over the 72-hour weekend from the 1.5% click-yield drop, and what boundary control would prevent this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0931", "title": "Byzantine Failures in Distributed GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Over 4 weeks on 8,192 GPUs, how many Byzantine failures are expected, and what is the chance of at least one?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 0}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0932", "title": "Evaluating BSP Mitigation Strategies for Stragglers", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use Asynchronous Parallel (ASP) or mitigated BSP for this 2048-GPU training cluster, and why?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 3}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0933", "title": "Diagnosing Stragglers in BSP 
Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does a single GPU that is 400ms late to the BSP AllReduce affect 1,024-GPU throughput and utilization?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 2}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0934", "title": "Diagnosing Silent Data Corruption in Distributed LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and isolate the root cause of this failure without halting the entire cluster for days?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 1}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0935", "title": "Evaluating SDC Mitigations in ZeRO-3 Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which SDC mitigation best fits the 250ms ZeRO-3 step budget, and why?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 2}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0936", "title": "Bulk Synchronous Parallel Straggler Impact", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the new BSP step time when one GPU takes 180ms to compute, and what is the compute utilization of the 63 healthy GPUs?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 1}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0937", "title": "A100 KV Cache L2 Thrashing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory hierarchy bottleneck causes the batch-16 throughput collapse, and how would you fix the kernel?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 2}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0938", "title": "GPU L1 Cache Tiling for Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"both", "question": "What is the largest power-of-two square tile size B that fits Q, K, and V in shared memory after reserving 32KB?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 1}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0939", "title": "Evaluating IO-Aware Attention Tiling on A100 Caches", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which tiling strategy provides better utilization of the Tensor Cores, and why?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 3}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0940", "title": "CUDA Allocator Fragmentation Estimation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 4GB allocation fail despite 16GB of inactive reserved memory, and how should you fix the allocator behavior?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0941", "title": "Diagnosing PyTorch OOM with Dynamic Sequence Lengths", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can the 2GB allocation fail with 25GB reserved but unallocated, and what production fix handles dynamic sequence lengths?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0942", "title": "Capacity Planning for Canary Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many dedicated g5.xlarge instances are required for the 5% v2 canary to guarantee the 200ms SLA?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 1}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0943", "title": "Mitigating Memory Fragmentation in Dynamic LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate disabling the allocator versus padding to 2048 tokens, and what memory strategy should replace them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0944", "title": "Canary Traffic Batching Timeouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the P99 latency spike, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 2}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0945", "title": "Canary Traffic Sizing and Resource Allocation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you choose between a 1% and 10% canary split given the 100-GPU limit and CTR significance needs?", "chain_ids": ["cloud-chain-auto-001-03"], "chain_positions": {"cloud-chain-auto-001-03": 3}, "chain_tiers": {"cloud-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0946", "title": "Queueing Delay in Provisioning", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did provisioning 75 instances fail at 50 RPS, and how many instances are needed to meet the 5s P99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0947", "title": "SLA-Constrained Inference Capacity", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the 19-GPU Batch-32 plan violate the 100ms P99 SLO, and how many GPUs are actually required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0948", "title": "GPU Capacity Planning with Batching Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming negligible batch accumulation time, how many GPUs are required to handle 4,000 RPS under the 150ms P99 SLO while minimizing cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0949", "title": "WAN Overhead in Carbon Scheduling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did moving the 10-hour 1,024-GPU job to EU-North increase carbon despite the cleaner grid?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0950", "title": "Centralized Checkpointing Network Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the absolute minimum time 
required just to transfer the checkpoint data from the workers to the coordinator?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 0}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0951", "title": "Carbon-Aware Workload Shifting", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many metric tons of CO2 are saved by delaying the 250-node, 50-hour preprocessing job by 12 hours?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 0}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0952", "title": "Evaluating Centralized Checkpointing Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can't the centralized head-node checkpoint meet the 2-minute SLA, and what architecture should replace it?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 2}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0953", "title": "Centralized Checkpoint Incast Failure", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What bottlenecks cause the 4.5-minute centralized checkpoint stall and rank-0 packet drops?", "chain_ids": ["cloud-chain-auto-004-03"], "chain_positions": {"cloud-chain-auto-004-03": 1}, "chain_tiers": {"cloud-chain-auto-004-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0954", "title": "Evaluating Elastic Training for Carbon Minimization", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Region 1 continuous training and Region 2 solar-only intermittent training compare on CO2 and time-to-market?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0955", "title": "Scaling Randomized Smoothing Certification", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you serve N=10,000 randomized-smoothing inferences under a 250ms P99 synchronous SLA?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-0956", "title": "Certified Radius Calculation for Biometric API", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum top-class probability p_A is required with σ=0.5 to certify radius R=0.5?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0957", "title": "ZeRO-3 Checkpoint Stalls on Lustre", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the 7TB checkpoint take 20 minutes instead of 35 seconds, and how would local NVMe staging unblock training?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 0}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0958", "title": "Diagnosing Distributed Checkpoint IO Storm Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes the checkpoint-induced collective timeouts, and what checkpointing architecture should replace the central NFS path?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 1}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0959", "title": "Mitigating Checkpoint Storms in LLM Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you buy 500GB/s Lustre or use two-tier local-NVMe checkpointing for the 2.45TB checkpoints, and why?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 2}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0960", "title": "Optimal Checkpoint Frequency for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the optimal checkpointing interval for a 1.6TB checkpoint, 80GB/s writes, and 24-hour MTBF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0961", "title": "Diagnosing Checkpoint Stalls in Large Scale LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "What bottleneck causes the 40-minute FSDP checkpoint stall when gathering to rank 0, and how should it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0963", "title": "Optimizing LLM Checkpoint Intervals at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With a 12h MTBF and 120s Lustre writes, what checkpoint interval is optimal, and is async node-local NVMe worth implementing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0964", "title": "Debugging Randomized Smoothing Radius Collapse", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does truncating randomized-smoothing samples from 100,000 to 200 collapse the certified radius to 0, and how should you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-20"], "chain_positions": {"cloud-chain-auto-secondary-015-20": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0965", "title": "Diagnosing Thread Exhaustion Cascades", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the total ThreadExhaustion outage when the feature store p99 hit 5s, and what mechanism contains it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0966", "title": "Evaluating Circuit Breaker Thresholds for Embedding APIs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is a circuit breaker preferable to retries for the 10,000QPS embedding API, and what timeout, threshold, and fallback should it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0967", "title": "Diagnosing Clean-Label Backdoors in KYC Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How was the glasses backdoor embedded without label flipping, and how would you detect the poisoned samples at 10M-image scale?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 1}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0968", "title": "Clean-Label Poisoning Ratio Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the minimum number of clean-label poisoned samples needed to reach a 5% poisoning ratio in the 10,000-sample target class?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 0}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0969", "title": "Evaluating Defenses for Clean-Label Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Given a strict 72-hour cluster allocation limit, should you defend with DP-SGD or activation-space clustering, and why?", "chain_ids": ["cloud-chain-auto-003-08"], "chain_positions": {"cloud-chain-auto-003-08": 2}, "chain_tiers": {"cloud-chain-auto-003-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0970", "title": "CPO Power Savings Calculation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-switch peak power savings from CPO and annual energy savings for the 500-switch cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0971", "title": "Evaluating CPO vs Pluggable Optics for 51.2T Switches", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 100,000-GPU fabric use 800G pluggables or CPO under the 40kW rack limit, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0972", "title": "Diagnosing Front-Panel Pluggable Power Walls", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What power breakdown explains the 1.8kW ToR draw, and would Co-Packaged Optics (CPO) resolve the rack bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0973", "title": "Cold Restart Recovery Time Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the total time to recovery after the switch failure before training can resume?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 1}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0974", "title": "Evaluating Cold Restart vs Warm Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you approve the 2-minute warm restart system or keep the 15-minute cold restart architecture, and why?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 3}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0975", "title": "Diagnosing Cold Restart Read Storms", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck causing the 45-minute cold-restart delay and S3 503s, and what checkpoint-loading architecture fixes it?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 2}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0976", "title": "Diagnosing DDP All-Reduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the DDP all-reduce bottleneck, and which fixes reduce exposed communication or increase overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0977", "title": "Evaluating MoE All-to-All Topology", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you spend $2M on 800 Gbps NDR or keep 400 Gbps and restrict EP=8 to intra-node NVLink groups, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0978", "title": "Diagnosing Stalled Computation During NCCL AllGather", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the hardware failing to overlap these operations?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 1}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0979", "title": "Evaluating Gradient Bucketing for Overlap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which gradient bucket size and CUDA stream configuration should you use for FSDP on 64 GPUs to maximize communication-computation overlap?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 2}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0980", "title": "Calculating Data Parallel Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Ring All-Reduce communication time per step and comm-to-compute ratio, and is the 16-node job communication-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0981", "title": "Data Parallelism on 10 Gbps Ethernet", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-step Ring All-Reduce communication and compute times, and is the 16-instance fine-tuning job communication-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0982", "title": "Evaluating Network Upgrades vs ZeRO-3 for LLM Scaling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 512-GPU run use 800 Gbps InfiniBand or ZeRO-3 on 400 Gbps, and how does each affect the comm/compute ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0983", "title": "Calculating Gradient Bucketing Overlap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 4 gradient buckets, perfect CUDA-stream overlap, and no scheduling overhead, what is the new backward-plus-sync time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0984", "title": "Diagnosing High Communication-Computation Ratio in 3D Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 25% SM utilization, and how would you change TP/PP placement to avoid inter-node tensor-parallel AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0985", "title": "Evaluating Spatial Placement for Fused Attention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should Softmax run sequentially on the MXU or be spatially pipelined on the VPU, and what is the resulting block latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0986", "title": "Systolic Placement of Depthwise Convolutions", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does mapping the 3x3 depthwise convolution onto the 128x128 systolic array yield under 1% utilization, and where should it run instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0987", "title": "Computational Graph Node Fusion", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the total execution times before and after fusing the 5 element-wise graph nodes, and what speedup does fusion provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0988", "title": "Evaluating Operator Fusion in Computational Graphs", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you compile the graph to fuse LayerNorm, GeLU, and residual adds, and how much HBM traffic does fusion save for a 16MB activation?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 3}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0989", "title": "Wall-Clock Training Time Estimation for 175B Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many days will it take to train the 175B model on 1T tokens using 1,024 A100s at 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0990", "title": "Weight vs Output Stationary Dataflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many weight-matrix element reads does WS versus OS require for this 256x256 tiled XW layer, and which placement minimizes weight reads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0991", "title": "Diagnosing Low MFU in LLM Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the primary bottleneck behind the 35% MFU, and how would you change the pipeline schedule or micro-batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0992", "title": "Diagnosing OOM from Retained Graphs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing GPU memory to grow by 2.5GB per iteration, and how should the loss logging code be 
fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0994", "title": "Profiling Network Upgrades in ViT Training", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did doubling the ViT-Huge cluster network bandwidth from 200Gbps to 400Gbps reduce step time by only about 2%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0995", "title": "Evaluating MFU vs Network Upgrades", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you spend 6 weeks raising MFU to 55% or upgrade the interconnect, and what training-time savings justify the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0996", "title": "LLM Dense Layer Profiling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the [2048,4096] x [4096,4096] FP16 projection compute-bound or memory-bound on an A100, and what is its minimum latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0997", "title": "Diagnosing MLP Optimization Bottlenecks", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Will 2x weight compression speed up the ViT MLP layer, or is it compute-bound under the Roofline model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0998", "title": "Evaluating Upgrades for High-Intensity GEMMs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you upgrade to A100 80GB memory bandwidth or use 2:4 sparsity for the GEMM SLA miss, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-0999", "title": "CXL vs InfiniBand for DLRM Embeddings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the average embedding lookup latencies for two-node InfiniBand sharding versus single-node CXL expansion, and which is lower?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 0}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1000", "title": "Evaluating CXL Memory Pooling for DLRM", "topic": "vram-budgeting", 
"competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 10TB DLRM embeddings, should you choose RDMA remote memory or rack-scale CXL 3.0 pooling for random lookups, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1001", "title": "DLRM Bottleneck with CXL Pooling", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did CXL 2.0 memory pooling hurt embedding lookup tail latency despite low bandwidth use, and how should embeddings be tiered?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 1}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1002", "title": "Diagnosing Silent Failures in CTR Prediction", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this silent model failure and architect a mitigation strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1003", "title": "Financial Impact of Drift Mitigation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the net daily financial impact of moving to daily retraining if it recovers the CTR to 4.5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1004", "title": "Mitigating Concept Drift in High-Throughput Fraud Detection", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the anti-fraud system use 5-minute online learning or 6-hour micro-batch retraining for stable P(X) but falling precision, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1005", "title": "Evaluating OOD Rejection Under Strict Latency", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which calibration strategy should the MRI API use to abstain on OOD scanner artifacts under the 200ms P99 latency budget?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 2}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1006", "title": "Debugging RoCEv2 Congestion Spreading in a Clos Network", "topic": "congestion-control", "competency_area": "networking", "track": 
"cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can Job A's RoCEv2 incast and PFC pauses stall Job B in other racks, and what mitigations would you apply?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 1}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1007", "title": "Incast-Driven PFC Congestion Spreading Time", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 4:1 incast take to trigger PFC at the 4MB threshold, and how does that affect unrelated Spine traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1008", "title": "Diagnosing OOD Overconfidence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this overconfidence and what calibration technique resolves it without requiring a full retraining cycle or violating the latency constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1010", "title": "RoCEv2 Fabric-Wide PFC Congestion Spreading", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What mechanisms drove the PFC pause storm, and how would DCQCN with lower ECN thresholds contain it?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 2}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1011", "title": "Spot Preemption Batch Adjustment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you adjust your data loading and training loop parameters to resume training without altering the mathematical optimization dynamics?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 0}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1012", "title": "Unstructured Sparsity Without Sparse Kernels", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 50% unstructured pruning fail to speed training, and what should you change to get real sparsity benefits?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 1}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1013", "title": "Elastic Scale-Down with Constant Global Batch Size", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "When the fleet scales from 64 to 16 nodes with GBS locked at 2048, how should you adjust micro-batch and accumulation under 40GB VRAM?", "chain_ids": ["cloud-chain-auto-005-03"], "chain_positions": {"cloud-chain-auto-005-03": 2}, "chain_tiers": {"cloud-chain-auto-005-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1014", "title": "Diagnosing Static Preprocessing Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are these operations burning GPU cycles at runtime, and how do you eliminate this overhead without altering the numerical output?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 2}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1015", "title": "Quantifying Late-Stage Constraint Costs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the cost of discovering the T4 16GB constraint after training versus profiling it upfront, and what process change prevents it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1016", "title": "Calculating Wasted FLOPs from Unfolded Constants", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many FLOPs are wasted per inference by computing the fixed causal mask and frozen W_1 @ W_2 at runtime instead of constant folding them?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 1}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1017", "title": "Constant Folding Dense Normalization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What memory-traffic benefit should you expect from constant folding the 1M-feature normalization subgraph on an L4?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 3}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1018", "title": "LLM Deployment VRAM Constraint Propagation", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What caused the OOM on the T4s despite INT8 weights, and how should the serving constraints have propagated back into architecture design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1019", "title": "70B Fraud Model Quantization on a 24GB GPU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is post-training 4-bit quantization and pruning a viable salvage strategy for the 70B FP16 fraud model on a single 24GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1020", "title": "In-Place Rolling Deployment VRAM Saturation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the in-place zero-downtime rollout of two 15GB DLRM models on 16GB GPUs spike P99 latency, and what architecture fixes it?", "chain_ids": ["cloud-chain-auto-001-02"], "chain_positions": {"cloud-chain-auto-001-02": 2}, "chain_tiers": {"cloud-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1021", "title": "Canary Rollout Time Estimation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much total time is required to reach 100% traffic at 400 RPS under the 5,000-request and 10-minute bake rules?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1022", "title": "Continuous Deployment VRAM Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which deployment strategy should you use for 15GB CTR model updates on 24GB L4s to avoid OOMs and P99 latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1023", "title": "Continuous Training Frequency Optimization", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "To maximize net profit, how many days should you wait between retraining runs?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 1}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1024", "title": "Seasonality-Induced Drift Triggers", "topic": "mlops-lifecycle", 
"competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely root cause of the hyper-active retraining, and how do you prove it?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 2}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1025", "title": "CNN Layer Compute and Memory Sizing", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For one 1024x1024x3 tile, what are the MACs and FP16 output feature-map footprint of the 7x7 stride-2, 64-channel convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1026", "title": "Continuous Fine-Tuning vs From-Scratch Retraining", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural and modeling risks are introduced by daily CFT, and how do you quantitatively justify the decision?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 3}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1027", "title": "Evaluating 3D vs Factorized Convolutions for Video", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you retain the standard 3D ResNet on 16GB T4s or replace it with factorized (2D spatial + 1D temporal) convolutions, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1028", "title": "Diagnosing Depthwise Conv Underutilization", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do MobileNet depthwise convolutions show under 20% SM utilization on A100 even at batch size 256?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1029", "title": "Diagnosing Pipeline Calibration Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did Platt scaling make CTR calibration look normal while overall ad revenue fell 15% after retrieval recall increased?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 1}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1030", "title": "Hidden 
Correction Cascades in Bidding Pipelines", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the over-bid per impression and daily loss from the stale 1.25x CTR correction, and how should you prevent this cascade?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 0}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1031", "title": "Diagnosing Correlated GPU Node Failures in a Cluster", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What root cause explains exactly those 16 co-located nodes dropping simultaneously in the 512-GPU training cluster?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 2}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1032", "title": "Mitigating Correlated Rack Failures in 2048-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate the trade-offs between node-local NVMe checkpointing and cross-rack asynchronous checkpointing, considering the impact of a correlated failure on training progress and cluster utilization?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 3}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1033", "title": "Financial Impact of Counterfeit Transceivers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected additional monthly downtime cost caused by a 15% counterfeit rate among the 256 optical transceivers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1034", "title": "Evaluating Pipeline Correction Cascades", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you accept the fallback-ad and CTR temperature-scaling hotfixes, or fix the CG model directly, and why?", "chain_ids": ["cloud-chain-auto-004-14"], "chain_positions": {"cloud-chain-auto-004-14": 2}, "chain_tiers": {"cloud-chain-auto-004-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1035", "title": "Evaluating Grey-Market GPU Fleet Reliability", "topic": "fault-tolerance-checkpointing", "competency_area":
"reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you integrate the unauthorized baseboards or delay 6 months for verified hardware, given 15% ECC errors and throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1037", "title": "Diagnosing Anomalous SDC and ECC Errors in GPU Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What likely explains the refurbished A100 ECC, SDC, thermal, and firmware anomalies, and what definitive diagnostic would you run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1038", "title": "Diagnosing Covariate Shift in Fraud Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is the fraud model suffering concept drift or covariate shift, and how would you diagnose and adapt without waiting 4 weeks for labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1039", "title": "Rack-Level Correlated Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the 14-day job failure probability for packing 64 nodes into 2 racks versus spreading them across 8 racks?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 1}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1040", "title": "Evaluating Covariate Shift Mitigation in CTR Prediction", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a PSI of 0.35 with unchanged P(Y|X), should you fully retrain the CTR model or use importance weighting, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1041", "title": "Calculating Latency Impact of CPU Affinity", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you use CPU affinity and NUMA binding to bring 4-thread BERT requests under the 15ms SLA?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 0}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1042", "title": "Evaluating CPU Pinning for P99 Latency SLA", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt strict CPU affinity on the dual-socket EPYC inference fleet despite possible throughput loss, and why?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 2}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1043", "title": "Diagnosing AVX-512 Remainder Loop Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the vectorization efficiency so low, and how do you root-cause and fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1044", "title": "Evaluating AVX-512 VNNI for Custom Kernels", "topic": "extreme-quantization", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you hand-write AVX-512 VNNI intrinsics for the custom INT8 attention kernel or refactor to oneDNN-backed operators, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1045", "title": "Calculate AVX-512 INT8 Peak Throughput", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical peak INT8 MAC throughput per clock cycle for one Ice Lake core using AVX-512 VNNI and 2 FMA units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1046", "title": "Diagnosing P99 Latency Jitter on CPU Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and resolve this performance jitter?", "chain_ids": ["cloud-chain-auto-011-05"], "chain_positions": {"cloud-chain-auto-011-05": 1}, "chain_tiers": {"cloud-chain-auto-011-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1047", "title": "Calculating Credit Assignment Compute Requirements", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long would one optimization step take with finite differences versus standard reverse-mode autodiff for the 1.5B-parameter Transformer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1048", "title": "Diagnosing Early-Layer Credit Assignment Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "Why have gradients collapsed in the early layers of the Post-LayerNorm Transformer, and what architecture change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1049", "title": "Diagnosing InfiniBand Scaling Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are millions of 10KB AllReduce operations scaling poorly on 400Gbps InfiniBand, and is upgrading to 800Gbps the right fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1050", "title": "Gradient Synchronization Critical Message Size", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What critical message size makes payload transfer time equal the 2µs startup latency on a 400Gbps RoCEv2 network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1051", "title": "CUDA Graphs for Low-Latency Inference", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the end-to-end latency before and after CUDA Graphs for 250 kernels with 4µs GPU time and 5µs launch overhead?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 0}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1052", "title": "Evaluating Fabrics via Critical Message Size", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which fabric minimizes communication for 1MB updates, what are their critical message sizes, and when should you switch to Fabric A?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1053", "title": "CUDA Graphs vs Kernel Fusion for LLM Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate these two optimization options to hit the 50ms SLA?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 2}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1054", "title": "CUDA Graphs for H100 Decode Launch Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", 
"question": "Why is the H100 idle 40% during Llama-3 8B decode with 3µs kernels, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-005-05"], "chain_positions": {"cloud-chain-auto-005-05": 1}, "chain_tiers": {"cloud-chain-auto-005-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1055", "title": "Evaluating Unbounded Credit Assignment Strategies", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If exact credit assignment over a 32,768-step training horizon is required, should you use TBPTT with a 2,048-step window or full BPTT with gradient checkpointing, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1056", "title": "Diagnosing H2D Transfer Overlap Failures", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are H2D copies and compute still sequential despite pin_memory=True and non_blocking=True, and how do you overlap them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1057", "title": "Evaluating Multi-Stream Overlap for Inference Pipelines", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What throughput speedup can a 3-stream H2D/compute/D2H pipeline deliver, and what constraints could prevent reaching it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1058", "title": "Diagnosing Custom ASIC Underutilization with Dynamic Shapes", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the custom ASIC show 3x higher P99 latency at 25% utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1059", "title": "NCCL AllReduce vs Backward Compute Overlap on a 4-GPU T4 Box", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "On 4x T4s with no NVLink and a 2 GB FP32 gradient over PCIe, what does bucketed AllReduce overlap save versus a post-backward single AllReduce, and where do bucket-size returns diminish?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1060", "title": "Custom ASIC TCO Break-Even Evaluation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what GPU-equivalent deployment scale does the custom ASIC break 
even, and what architectural risk must justify it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1061", "title": "Sizing an NER-based PII Redaction Fleet", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPUs are needed to handle 10,000 RPS of 256-token DistilBERT PII anonymization at 50% utilization given a 125 TFLOPS capacity per GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1062", "title": "CPU Bottleneck in PII Anonymization", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 2.5s P99 latency spike despite 45% GPU utilization, and how should the DAL be deployed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1063", "title": "ASIC TCO Break-Even Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many ASICs are required to break even on the $50M NRE after normalizing for QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1065", "title": "Evaluating Real-World Generalization vs Public Benchmarks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which model should you deploy for the 100,000/day medical imaging API, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1066", "title": "Diagnosing Production Accuracy Collapse in Cloud CV", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is production accuracy 72% despite 94% offline accuracy, and how would you recover it?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 1}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1067", "title": "Evaluating Local vs Managed Data Anonymization Layers at Scale", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use the managed DLP API or local BERT-Large DAL for 100 RPS, and how do latency, utilization, cost, compliance risk, operational burden, HA, and failure modes affect the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1068", "title": "Diagnosing 
Silent Feature Corruption", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the FPR spike after retraining, and why did standard validation metrics fail to catch it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1069", "title": "Upstream Cleaning vs Downstream Robustness", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you absorb the 2% label noise downstream or build the 100,000 CPU-hour cleaning pipeline, and what is the cost trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1070", "title": "Compounding Costs of Data Cascades", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the production-stage data cascade cost versus the wasted 64-A100 training compute cost?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 0}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1071", "title": "Debugging Accuracy Plateau in Defect Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you allocate your ML engineering resources to unblock the 98% target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1072", "title": "Calculating Transfer Time for a 20PB Dataset", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical minimum transfer time, and what architecture circumvents this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1073", "title": "Evaluating Data-Centric Upgrades vs Model Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to ViT-Large or mine hard negatives to retrain ResNet-50, given the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1074", "title": "Data Lineage Overhead at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What daily lineage storage overhead and ingest bandwidth does 50,000 TPS create, 
and how should you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1075", "title": "Foundation Model Data Gravity Trade-offs", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you stream the 20 PB dataset over dual 100 Gbps links or migrate it to S3 before training on 4,096 H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1076", "title": "Cost of Noisy Data Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total data-plus-compute costs for 50M noisy images versus 5M cleaned images, and which path is cheaper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1077", "title": "Diagnosing Pipeline Drops via Data Lineage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you utilize data lineage tracking to analyze the provenance of this feature and root-cause the regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1078", "title": "Evaluating Inline vs Out-of-Band Data Lineage at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use inline provenance metadata or an asynchronous lineage registry at 50,000 TPS, and what are the bandwidth and latency trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1079", "title": "Storage Cost of Naive Data Lineage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the 30-day S3 Standard storage cost ($0.023/GB-month) of full-copy lineage versus pointer-based lineage for the 5TB daily log pipelines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1080", "title": "Row-Level vs Partition-Level Lineage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 10TB LLM corpus use row-level UUID lineage or partition-level hashes and artifact URIs, and why?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 2}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1081", 
"title": "Diagnosing Mutable Lineage Failures", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the reconstructed feature distributions mismatch the MLflow stats, and what lineage change ensures reproducibility?", "chain_ids": ["cloud-chain-auto-011-12"], "chain_positions": {"cloud-chain-auto-011-12": 1}, "chain_tiers": {"cloud-chain-auto-011-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1082", "title": "Local vs Cloud Inference Offloading", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should inference run locally or in the cloud for each 50GB batch, and what are the total processing times?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 0}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1083", "title": "Cross-Environment Batch Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the GPUs at only 2.5% utilization on the 5 PB genomic job, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1084", "title": "Cross-Region Training vs Data Locality", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 100 TB daily trading logs be trained locally in Frankfurt or transferred to New York, and what is the end-to-end time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1085", "title": "Ring-AllReduce Communication Overhead Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Assuming a standard Ring-AllReduce implementation and no compute overlap, what is the exact network communication time per step?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 2}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1087", "title": "Gradient Clipping vs Data Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many poisoned samples appear per global batch, and how does per-sample clipping at C=1.0 change their relative gradient contribution?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1088", "title": "Diagnosing DDP Network Bottlenecks", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Under hierarchical AllReduce, why does utilization drop from 98% to 42% across 4 nodes, and what software-only fix improves scaling?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 3}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1089", "title": "Evaluating Defenses for LLM Continuous Pre-training Poisoning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which backdoor defense should you choose considering security guarantees and compute economics?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1090", "title": "Diagnosing Targeted Data Poisoning in LLM Fine-Tuning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you determine whether the refund-intent drop is targeted data poisoning or concept drift, and isolate the corrupted training records?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1091", "title": "S3 Small File GPU Starvation Diagnosis", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is bottlenecking the S3 dataloader, and how should the image dataset be laid out to restore GPU utilization?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 1}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1092", "title": "Optimizing S3 Throughput with Data Sharding", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What single-thread throughput do 100KB S3 reads achieve versus 500MB shards, and how does sharding fix the bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1093", "title": "Calculating Data Stall Ratio for Vision Transformers", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Assuming perfect asynchronous data prefetching where computation and I/O overlap entirely, what is the Data Stall Ratio for the accelerators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1094", "title": "Evaluating Storage Sharding vs Caching for Multimodal Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you deploy the 125TB Redis cache or convert the 500M samples into 1GB WebDataset shards, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1095", "title": "Mitigating Data Stalls in Distributed Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade the central file system to 300 GB/s or deploy local NVMe caches, and what is the quantitative justification?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1096", "title": "Lustre Parallel File Striping Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this severe data loading bottleneck, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1097", "title": "Checkpoint Loading via Data Striping", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum stripe count is needed to load the 2.5TB checkpoint in 20 seconds from 200Gbps storage nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1098", "title": "Optimizing Stripe Size for 4 TB/s AI Storage", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What stripe size and number of storage targets would you choose for 2 GB TFRecord files to hit 4 TB/s without excess metadata overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1099", "title": "Diagnosing ViT Data Pipeline Stalls", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the data stall ratio, and is the 30 ms gap caused by network I/O or CPU preprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1100", "title": "AOT Dataset Compilation for H100 GPU Clusters", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the network and CPU requirements to determine if you should 
use JIT decoding or AOT dataset compilation to feed the 32,000 images/sec?", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1101", "title": "Evaluating WS vs OS Dataflows for LLM Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which dataflow should the 256x256 array use for the 8192x8192 layer in prefill versus batch-1 decoding, and what HBM traffic drives the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1102", "title": "Diagnosing GEMM Memory Bandwidth Bottlenecks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause and optimize the dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1103", "title": "Diagnosing Data Loader Bottlenecks in Vision Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze and resolve this data pipeline bottleneck using a dataset compilation approach?", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1104", "title": "Weight-Stationary Tiling DRAM Bandwidth", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GB of DRAM reads are required for activation matrix A under the TPU's 32MB weight-stationary tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1105", "title": "Evaluating Data Compilation Strategies", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to 192-core CPU nodes or use offline dataset compilation for the 64-GPU image pipeline, and why?", "chain_ids": ["cloud-chain-auto-secondary-005-05"], "chain_positions": {"cloud-chain-auto-secondary-005-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1106", "title": "Demographic Data Scaling Shortfall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": 
"fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If you rely on naive random data collection from a pipeline where Demographic A has a 1% true occurrence rate, how many raw images must you ingest to close the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1107", "title": "Diagnosing Regional Performance Degradation via Datasheets", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you use Datasheets for Datasets to perform a root-cause analysis of this failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1109", "title": "Diagnosing PFC Storms in DCQCN", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze why DCQCN is failing to prevent PFC triggering during incast microbursts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1110", "title": "Trade-offs of DCQCN Parameters in RoCEv2 AI Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you focus on aggressive DCQCN tuning or switch buffer/PFC threshold tuning for the MoE incast congestion, and why?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 2}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1111", "title": "DLQ Storage and Reprocessing Provisioning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What DLQ storage is required per 24-hour period, and what repair-job throughput is needed to drain a 24-hour backlog in a 4-hour window?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 0}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1112", "title": "Diagnosing Poison Pill Bottlenecks in Streaming Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you root-cause this blockage and re-architect the pipeline to restore throughput without losing the corrupted transaction records?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 1}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1113", "title": "Evaluating DLQ Architectures for High-Throughput Streams", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which DLQ strategy—synchronous Postgres, asynchronous Kafka, or stdout logging—meets the 5-second freshness SLA, and why?", "chain_ids": ["cloud-chain-auto-004-12"], "chain_positions": {"cloud-chain-auto-004-12": 2}, "chain_tiers": {"cloud-chain-auto-004-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1114", "title": "Declarative Autoscaling Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many L4 GPU replicas will KServe provision for 2,500 RPS at 60 ms latency with concurrency 16 and 75% target utilization?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 0}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1115", "title": "Evaluating Declarative API Sidecar Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does declarative scale-to-zero justify the per-pod sidecar overhead for 500 low-traffic models, and what trade-off gates adoption?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 2}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1116", "title": "DCQCN Incast Buffer Sizing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What peak switch buffer occupancy occurs before DCQCN takes effect, and what minimum buffer size avoids PFC pauses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1117", "title": "Silent Saturation in Declarative Autoscaling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the CPU-based declarative autoscaler fail at 600 QPS, and what metric should drive scaling instead?", "chain_ids": ["cloud-chain-auto-001-15"], "chain_positions": {"cloud-chain-auto-001-15": 1}, "chain_tiers": {"cloud-chain-auto-001-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1118", "title": "Control Loop Reconciliations for Gang Scheduling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the controller performs an immediate reconciliation at t=0 and then every 15 seconds, how many reconciliation cycles observe drift before the 3 autoscaled nodes become ready at t=240s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1119", "title": "Control Loop Thrashing in Gang Scheduling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What flaw causes the 32-GPU job's control-plane thrashing, what is the impact, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1120", "title": "Evaluating Declarative Scheduling for GPU Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What MTTR improvement does declarative scheduling provide over imperative scripts under 15% hourly churn, and what control-plane risk remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1122", "title": "Evaluating Demographic Parity Trade-offs in Cloud Resume Screening", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happens to Precision and Equal Opportunity if you enforce strict Demographic Parity despite Group M and Group N having 25% vs 10% base rates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1123", "title": "Multi-Core Power Trade-offs Post-Dennard Scaling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 300W CPU option provides better theoretical throughput for parallel ML workloads like embedding lookups, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1124", "title": "Memory Scaling of Widened Dense Layers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much additional FP32 GPU memory is needed when the dense layer grows from 8192x8192 to 32768x32768 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1126", "title": "Diagnosing Thermal Throttling in Multi-Core CPU Migrations", "topic": "mlops-lifecycle", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 64-core 2.4 GHz instance thermal throttle despite lower per-core frequency, and how would you mitigate it?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1127", "title": "Power Limits in Post-Dennard Era", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the expected total power consumption and theoretical throughput of the 8-PE 1.0 GHz design relative to the 100W 2.0 GHz baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1128", "title": "CTR MLP Low-Rank Factorization Memory Bandwidth Trade-off", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace the dense CTR MLP layers with rank-256 low-rank factorizations, and what are the bandwidth and capacity trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1129", "title": "Diagnosing Low Tensor Core Utilization in Cloud MLPs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are the FP16 MLP matmuls failing to dispatch to Tensor Cores, and how do you resolve the dimension-alignment issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1130", "title": "Versioned Embedding Cache Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What peak Redis memory is required to dual-serve 100M V1 768-d and V2 1024-d FP16 embeddings with 20% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1131", "title": "Transformer FFN FLOPs Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs does the [2048,8192] x [8192,32768] FFN forward pass require, and what is its FP16 lower-bound latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1132", "title": "Evaluating GPU Upgrades vs Software Batching for MLPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Will upgrading from A100 to H100 deliver a >3x latency reduction for batch-1 FP16 inference, or should you implement continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1133", "title": "Diagnosing Downstream Degradation in Cascaded Models", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the silent 12% CTR drop after the upstream BERT embedding update, and how would a dependency-aware registry prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1134", "title": "Intra-Rack 400G Cabling Power and Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 32 intra-rack 400G links under 2.5 m, should you use passive DAC or AOC, and what cost and power differences drive the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1135", "title": "Multi-Stage Embedding Model Migration", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you safely migrate from the 2048-d v1 embedding model to the 768-d v2 model while managing the temporary GPU cost surge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1136", "title": "400G DAC Link Degradation in Dense Racks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely physical root cause, and how do you systematically diagnose and mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1137", "title": "Diagnosing Hidden Disparities in API Performance", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze the inference logs to diagnose the root cause of these targeted failures despite healthy aggregate metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1138", "title": "Intra-Rack 400G Interconnect Trade-offs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 2m intra-rack 400G links, would you use passive DACs or AOCs, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1139", "title": "Subgroup Accuracy Disparities", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minority-group accuracy and error-rate disparity does disaggregated evaluation reveal, and how should that affect deployment?", "chain_ids": ["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 0}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1140", "title": "eKYC Disaggregated Evaluation Strategy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the 100,000-user random holdout insufficient to validate the 2% subgroup, and what sample size is mathematically required to bound their FRR variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1141", "title": "Evaluating Disaggregated Serving for LLMs", "topic": "compound-ai-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much TTFT does transferring the 8K-prompt KV cache add over 200 Gbps, and is disaggregated serving viable for a strict 500ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1142", "title": "KV Cache Transfer in Disaggregated Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What end-to-end TTFT does the user see after prefill, 2GB KV transfer over 200 Gbps, and first-token decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1143", "title": "Calculating Disparate Impact in Cloud Loan APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Disparate Impact Ratio for Group B versus Group A, and does it violate the standard four-fifths (80%) fairness rule?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 0}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1145", "title": "Evaluating Disparate Impact in Cloud-Based Credit Scoring", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use threshold post-processing or adversarial debiasing to raise DIR to 0.80 under the 50ms P99 SLA?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 2}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1146", "title": "Calculating Dispatch Tax in Eager Mode", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the actual forward-pass latency and effective GPU utilization once PyTorch dispatch overhead is included?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1147", "title": "Diagnosing Dispatch Overhead in Narrow Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the 16% GPU utilization, and what optimization should you use instead of rewriting kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1148", "title": "Diagnosing Bottlenecks in Disaggregated LLM Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the decode-pool TPOT degradation and OOMs for 8192-token contexts over 100 Gbps Ethernet?", "chain_ids": ["cloud-chain-auto-006-02"], "chain_positions": {"cloud-chain-auto-006-02": 1}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1149", "title": "CPU Dispatch Overhead in Eager Execution", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What per-operation dispatch tax does the profile imply, and what native PyTorch optimization would eliminate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1150", "title": "Mitigating Dispatch Overhead in Low-Latency Models", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you upgrade to H100s, use TorchInductor fusion, or wrap the TTS step in CUDA Graphs to reduce P99 latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1151", "title": "Choosing JIT Graph Compilation over Custom CUDA for 2ms GPU Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you build a custom monolithic CUDA kernel or use a JIT graph compiler to meet the SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1152", "title": "Diagnosing Low GPU Utilization in Eager Mode GNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU utilization so low, and how would you resolve this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1153", "title": "Centralized vs. 
Distributed Checkpointing Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long would centralized checkpointing take versus distributed checkpointing for the 2.1TB FSDP state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1154", "title": "Distributed ZeRO-3 Checkpointing Across Object-Storage Prefixes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you checkpoint the 2.1TB ZeRO-3 state to minimize pause time and avoid object-storage prefix throttling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1155", "title": "Calculating the Financial Impact of Feature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the daily financial impact and extra manual-review volume from the FPR rising from 2% to 8% at 500 TPS?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 0}, "chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1156", "title": "Diagnosing Rank-0 Bottlenecks in Massive Model Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is rank-0 checkpointing taking over 20 minutes and timing out, and what checkpointing architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1157", "title": "Diagnosing Silent Model Degradation in E-Commerce Recommendations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this degradation to pinpoint the root cause of the silent failure?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 1}, "chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1158", "title": "Mitigating Transient Covariate Shift in Recommendations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 24-hour promo covariate shift, should you emergency-retrain on 4 hours of data or toggle the trending-items fallback?", "chain_ids": ["cloud-chain-auto-003-10"], "chain_positions": {"cloud-chain-auto-003-10": 2}, "chain_tiers": {"cloud-chain-auto-003-10": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1159", "title": "Evaluating Overparameterization and Double Descent in Vision Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Under what conditions is scaling to 150M parameters theoretically justified, and how do you reconcile it with the 50ms inference latency budget?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 2}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1160", "title": "Diagnosing the Interpolation Threshold Error Spike", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the team diagnose the 1.2M-parameter validation spike before deciding whether to shrink, regularize, or scale?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 1}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1161", "title": "Pruning a Vision Model to Fit a 2 GFLOP Budget", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After pruning 80% of weights, what effective parameter count and forward-pass compute remain, and does the model fit the 2 GFLOP inference budget?", "chain_ids": ["cloud-chain-auto-005-13"], "chain_positions": {"cloud-chain-auto-005-13": 0}, "chain_tiers": {"cloud-chain-auto-005-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1162", "title": "Root-causing Deep MLP Training Collapse", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this stalled training, and how would you analyze the contribution of the learning rate schedule and initialization to this failure state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1163", "title": "Wasted Activation VRAM from Dying ReLUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much VRAM (in MB) is wasted storing zero activations for the backward pass across all 4 ReLU layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1164", "title": "Evaluating Activation Trade-offs for Dead Neurons", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace ReLU with GeLU or SwiGLU despite extra activation FLOPs, and what is the hardware trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1165", "title": "Dynamic Batching Delay Calculation", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum max_batch_size guarantees the 50ms SLA when max_batch_delay must be 10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1166", "title": "Debugging Dynamic Batching Latency Spikes", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 hit 130ms during bursts, and how should max_queue_delay be reconfigured?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 2}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1167", "title": "Rolling Window Dynamic Benchmarking Cost", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum rolling window N you can afford under the $1,500 monthly dynamic benchmarking budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1168", "title": "Evaluating Dynamic Benchmarks for Code Generation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the daily dynamic benchmark, and why use hybrid LLM-as-judge plus 5% HITL instead of pure HITL?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1170", "title": "Dynamic Batching for Strict Latency SLOs", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the largest max_batch_size from the table that stays under the 70ms server-side budget, and what conservative max_queue_delay would you choose?", "chain_ids": ["cloud-chain-auto-021-09"], "chain_positions": {"cloud-chain-auto-021-09": 3}, "chain_tiers": {"cloud-chain-auto-021-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1171", "title": "Dynamic Batching Timeout Trade-offs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does 
max_batch_size 16 with a 60ms batch_timeout satisfy the 100ms P99 SLA, and what timeout would be safe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1172", "title": "GPU Voltage Scaling Dynamic Power Reduction", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What percentage reduction in dynamic power results from lowering voltage by 15% and frequency by 10%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1173", "title": "Dynamic Batching Wait Window Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What max dynamic batching wait window and batch size keep the translation API within the 200ms P99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1174", "title": "Evaluating DVFS for Cluster Power Capping", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use frequency-only throttling or DVFS to stay under the 20MW power limit while maximizing aggregate throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1175", "title": "Diagnosing Non-Linear Power Drops in GPU DVFS", "topic": "mlops-lifecycle", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did a 25% SM clock reduction cut dynamic power by nearly 58% instead of the projected 25%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1176", "title": "DVFS Power Cap for High-Density GPU Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What percentage reduction in clock frequency is needed to cap each GPU at 300W, and how much does theoretical throughput drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1177", "title": "Diagnosing P99 Latency Spikes in Dynamic Inference", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency 250ms with low GPU utilization, and what preprocessing and feature-fetch changes would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1178", "title": "Diagnosing DVFS Stragglers in AllReduce", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the intermittent 1200ms spikes, and what system-level mitigation stabilizes iteration time?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1179", "title": "Eager Execution Dispatch Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the CPU dispatch time, GPU compute time, maximum GPU utilization, and best fix for the 50,000 tiny matmuls per forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1180", "title": "Diagnosing Low GPU Utilization in Dynamic Dispatch", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the GPU starved, and how do you analyze and resolve this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1181", "title": "Eager vs Static Execution in Dynamic GNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate eager versus static execution for this dynamic GNN, and what production deployment strategy would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1182", "title": "Evaluating DVFS Power Capping for LLM Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which power-capping strategy should the 35kW H100 rack use, uniform DVFS downclocking or selectively idling GPUs, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1183", "title": "Diagnosing ECMP Hash Collisions in RoCEv2 Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root cause explains the spine uplink imbalance, and what telemetry would confirm ECMP hash collisions?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 1}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1184", "title": "Evaluating ECMP for RoCEv2 GPU Training Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What risks does pure ECMP pose for 4,096-GPU synchronous LLM training, and what network architecture would you choose instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1185", "title": "Single-Flow ECMP Hashing Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the max uplink utilization, what routing behavior causes the 100Gbps cap, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1186", "title": "Calculating the Efficiency Frontier Trade-off", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum batch size and throughput does each model achieve under the 100ms SLA, and what throughput-mAP trade-off results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1187", "title": "LLM Concurrency and KV Cache Thrashing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did raising max concurrency from 64 to 256 spike p99 TTFT and reduce throughput on the 8xH100 70B service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1188", "title": "Evaluating the LLM Efficiency Frontier", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you downsize to an 8B FP16 model or run a W4A16 70B model on 2 A100s to cut cost by 50%, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1189", "title": "Spot Instance Preemption and Batch Scaling", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After preemption to 48 GPUs, what are the new global batch size, learning rate, and steps per 1,228,800-image epoch?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 1}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1190", "title": "Diagnosing TorchElastic Spot Preemption Stalls", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes the 15-minute TorchElastic stall after a node preemption, and how would you reduce it?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 2}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1191", 
"title": "Evaluating Preemption Overheads in Elastic LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Assuming a 15-minute checkpoint interval for Framework A, which framework maximizes goodput over a 14-day run, and why?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 3}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1192", "title": "Elephant Flow Collisions with ECMP", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why are some RoCEv2 ports saturated during the 130GB AllReduce while others idle, and how would you fix the routing?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 0}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1193", "title": "Diagnosing ECMP Hash Collisions", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1194", "title": "Evaluating ECMP vs Adaptive Routing for Elephant Flows", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you mitigate AllReduce ECMP collisions with switch Adaptive Routing or a Rail-Optimized topology, and why?", "chain_ids": ["cloud-chain-auto-002-05"], "chain_positions": {"cloud-chain-auto-002-05": 2}, "chain_tiers": {"cloud-chain-auto-002-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1195", "title": "Resource Exhaustion in ELT Feature Generation", "topic": "compound-ai-systems", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the resource exhaustion, and how should the transformation pipeline be restructured to handle this scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1197", "title": "Evaluating ELT Architecture for ML Feature Agility", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you transition to ELT for 50TB/day raw telemetry, and what are the cost/flexibility trade-offs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-1198", "title": "DLRM Embedding Sharding Strategy", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you shard the 200GB user table and four 5GB tables across 8x80GB A100s to maximize DLRM throughput without OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1199", "title": "Carbon Footprint Calculation for GPU Clusters", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 3-year lifecycle carbon footprint for the 1,024-GPU cluster, and what percentage is embodied carbon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1200", "title": "DLRM Embedding Table Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory does the 1B-by-128 FP32 Adam embedding table require, and how many 80GB GPUs are minimally needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1201", "title": "Diagnosing DLRM Load Imbalance", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this bottleneck and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1202", "title": "Analyzing Lifecycle Carbon in GPU Refresh Cycles", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the fundamental flaw in calling the renewable-powered cluster zero-carbon while refreshing GPUs every 1.5 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1203", "title": "Lifecycle Carbon Analysis of Hardware Refresh", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you evaluate the true carbon impact of this hardware refresh over a 3-year depreciation cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1204", "title": "GPU Energy Efficiency for Embedding Extraction", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much daily energy does Setup A versus Setup B use to process 100M documents, and which is more energy-efficient?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1205", "title": "Non-Linear Power Scaling in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this non-linear scaling of power vs. throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1206", "title": "Energy-Movement Invariant in Feature Pipelines", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which approach is more energy-efficient, and what is the approximate energy gap per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1207", "title": "Evaluating GPU Energy Metrics under Diurnal Load", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is 60s nvidia-smi power insufficient for INT8 versus FP16 rollout, and what energy metric should guide deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1208", "title": "Evaluating Edge vs Centralized Processing for Telemetry Data", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you centralize 5 PB in us-east-1 or keep telemetry regional if WAN energy is 0.01 kWh/GB and training uses 10,000 H100-hours at 700W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1209", "title": "LLM Inference Energy Bottleneck", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 70B batch-1 inference token, how much energy goes to HBM reads versus FP16 compute, and which dominates?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1210", "title": "Diagnosing Cross-Region Training Energy Spikes", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the true energy bottleneck in the 50TB/day CTR pipeline, and why won't upgrading to H100s with FP8 fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1211", "title": "Evaluating Kernel Fusion Energy Trade-offs", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you accept this proposal, and how does the 20% FLOP penalty impact the overall energy consumption per token?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1212", "title": "Root-Causing Memory Power in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 20B batch-1 decoding service draw 180W with only ~0.1W of ALU work, and how would you reduce it?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1213", "title": "Diagnosing High TDP in Low-Util Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do A100s hit 400W TDP at batch size 1 despite only 20% tensor-core utilization, and what mitigations reduce the draw?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1214", "title": "LLM Inference Energy Movement Bottleneck", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will a 15x INT8 MAC energy reduction meaningfully cut fleet power for batch-1 70B decoding with the same HBM, and why?", "chain_ids": ["cloud-chain-auto-secondary-009-15"], "chain_positions": {"cloud-chain-auto-secondary-009-15": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1215", "title": "Calculating Arithmetic Power for LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum active compute power required strictly for the arithmetic operations, ignoring memory access and static system leakage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1217", "title": "Resume Screening True Positive Rate Trade-offs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach should you use to achieve Equality of Opportunity for the resume API under the 100ms SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1218", "title": "Resume Screening API Equality of Opportunity Threshold Tuning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the current True Positive Rates for Groups A and B, and how many additional true positives does Group B need for Equality of Opportunity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1221", "title": "Erasure Coding Storage Footprint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much raw storage does RS(10,4) require for 10 PB of data, and how many simultaneous drive failures per stripe can it tolerate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1222", "title": "Exabyte-Scale LLM Data Erasure Coding Migration", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you migrate the 200 PB active training dataset from 3-way replication to RS(10,4), and what throughput trade-offs drive the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1223", "title": "Cluster MTBF from ECC Uncorrectable Errors", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the cluster MTBF from uncorrectable HBM errors, and is a 4-hour checkpoint interval acceptable?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 1}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1224", "title": "Diagnosing High Correctable ECC Error Rates", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can 10,000 correctable ECC errors/sec cause a 20% step-time slowdown, and how should it be addressed?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 2}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1225", "title": "Object Storage Tail Latency Spikes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What storage-level mechanism causes these rare but severe tail latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-1226", "title": "Randomized Thresholds for Equalized Odds", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What randomization probabilities over the two ROC operating points achieve Equalized Odds at exactly FPR=0.15 and TPR=0.75 for Groups A and B?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1227", "title": "Applying Error Feedback in Top-k Gradient Sparsification", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With error feedback, what is the next compensated gradient, will it be transmitted, and what FP16 residual memory overhead is required per GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1228", "title": "Evaluating HBM3 ECC overhead at 24k GPU scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should ECC be enabled or disabled on the 24,576-GPU H100 cluster, and how does the cluster MTBF affect the throughput trade-off?", "chain_ids": ["cloud-chain-auto-004-15"], "chain_positions": {"cloud-chain-auto-004-15": 3}, "chain_tiers": {"cloud-chain-auto-004-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1229", "title": "Diagnosing Top-k Gradient Divergence", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this divergence, and how do you fix it while maintaining the 99% compression ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1230", "title": "Evaluating Error Feedback Memory Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does error feedback cause OOM for the 20B model, and how would you fix it while preserving the communication reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1231", "title": "Analyzing Silent Data Corruption Propagation in 3D Parallelism", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you trace a single-GPU FP16 silent data corruption through TP=8, PP=8, DP=32 to isolate the failing H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1232", "title": "SDC Propagation vs Checksum Trade-off", "topic": "fault-tolerance-checkpointing", "competency_area": 
"reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 500B job, would you checksum every gradient sync or only checkpoint states, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1233", "title": "AllReduce Fault Propagation Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What total wall-clock time is needed to achieve 1,000 hours of effective training progress under this fault model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1234", "title": "Energy Savings with Event-Driven Activation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What daily energy and dollar savings does the event-driven camera pipeline achieve after accounting for the always-on filter?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 0}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1235", "title": "Evaluating Event-Driven Activation for Cloud Video Analytics", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy vs. 
latency trade-offs of this design: under what hardware and data conditions does the overhead of the event-generation logic negate the overall power savings?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 2}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1236", "title": "Diagnosing GPU Power in Event-Driven SNNs", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 98%-sparse SNN still run GPUs at 100% utilization, and what execution change is needed for energy savings?", "chain_ids": ["cloud-chain-auto-015-06"], "chain_positions": {"cloud-chain-auto-015-06": 1}, "chain_tiers": {"cloud-chain-auto-015-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1237", "title": "Model Weight Exfiltration Timing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How large is the FP16 7B model theft payload, how long does exfiltration at 100 Mbps take, and does it evade the 10 GB/10-min alarm?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1238", "title": "Diagnosing Model Exfiltration via HostPath", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 10 Gbps, 2-minute egress spike, and what vulnerability allowed the 70B model weights to be stolen?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1239", "title": "Evaluating Confidential Computing for Exact Model Theft", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture best prevents exact extraction of the 140GB model by an insider, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1240", "title": "Diagnosing H2D Transfer Serialization on A100 GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is H2D transfer serialized with compute despite using pinned memory and non_blocking=True, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1241", "title": "Overlapping Transfer and Compute via Streams", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At 32MB/ms 
H2D bandwidth, what total end-to-end time is needed to process 1,000 requests in naive synchronous execution versus pipelined pinned-memory CUDA streams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1242", "title": "Evaluating Stream Overlap for H2D Transfers", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What speedup does overlapping the 10ms H2D transfer with 40ms compute provide, and what are the associated memory costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1243", "title": "Calculating Expected Calibration Error", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Expected Calibration Error for the two confidence bins over the 10,000-sample test set?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 0}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1244", "title": "Diagnosing Overconfident Predictions in Cloud Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the 94%-accurate classifier failing confidence-based routing, and what low-latency post-hoc calibration fix would you apply?", "chain_ids": ["cloud-chain-auto-004-10"], "chain_positions": {"cloud-chain-auto-004-10": 1}, "chain_tiers": {"cloud-chain-auto-004-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1245", "title": "Choosing Temperature Scaling to Reduce ECE Under a 2ms SLA", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which low-latency calibration method should you try to reduce ECE within the 2ms SLA, and how does ECE guide the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1246", "title": "KV Cache External Fragmentation Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many new requests needing 150MB contiguous KV cache blocks can be admitted, and how much free memory is unusable fragmentation?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 1}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1247", "title": "Diagnosing KV Cache External Fragmentation", "topic": 
"vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose this discrepancy between reported free memory and the OOM failures, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 2}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1248", "title": "Evaluating KV Cache Allocators for Variable Sequences", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use compacting garbage collection or PagedAttention-style allocation to fix KV cache fragmentation, and why?", "chain_ids": ["cloud-chain-auto-008-01"], "chain_positions": {"cloud-chain-auto-008-01": 3}, "chain_tiers": {"cloud-chain-auto-008-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1249", "title": "Calculating Optimal Checkpoints for Fail-Stop Errors", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using Young's formula, what are the optimal checkpoint interval and daily checkpointing overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1250", "title": "Diagnosing NCCL Timeout in Fail-Stop Node Crashes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the fail-stop node crash take 30 minutes to surface in NCCL, and how would you configure the fabric to detect it faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1251", "title": "Optimizing Fail-Stop Checkpoint Intervals", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the proposed 12-hour interval sound, and what optimal interval is justified by Daly's formula?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1252", "title": "Rack-Aware Capacity Provisioning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total nodes are needed to survive one rack failure when spreading the 6-node baseline across 3 racks versus 4 racks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1253", "title": "Evaluating Topologies for Cross-Rack Redundancy", "topic": "fault-tolerance-checkpointing", "competency_area": 
"reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you uniformly spread active replicas across all 10 racks, or strictly isolate primary/replica clusters to specific PDU boundaries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1254", "title": "Correlated Rack-Level Training Failure", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the 128-node training job fail despite an 8-node redundancy buffer, and what topology flaw caused it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1255", "title": "Estimating Cluster Failure Frequency", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many node failures should the fault-tolerance system expect during the 30-day run?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 2}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1256", "title": "Evaluating Optimal Checkpoint Frequency at Scale", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What checkpoint interval maximizes training goodput for the 16,384-GPU cluster given 40,000-hour node MTBF and 5-minute saves?", "chain_ids": ["cloud-chain-auto-004-11"], "chain_positions": {"cloud-chain-auto-004-11": 3}, "chain_tiers": {"cloud-chain-auto-004-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1257", "title": "Large-Cluster MTBF and Goodput Analysis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the frequent interruptions and the core bottleneck affecting goodput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1258", "title": "Calculating Historical Fair-Share Priority", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the dynamic priority scores for Teams Vision and NLP, and which team receives the next 32 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1259", "title": "Fair-Share Scheduling Trade-offs for Bursty Workloads", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", 
"zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you reduce the decay half-life to 24 hours or add hierarchical quotas with node reservation, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1260", "title": "Diagnosing MMD Fairness Bottlenecks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 150ms to 1200ms slowdown, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1261", "title": "Starved Urgent Jobs in Fair-Share Clusters", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Team X's 16-GPU debug job starve, and what scheduler change would support urgent debugging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1262", "title": "MMD Penalty Overhead in Fairness Training", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much FP32 memory and how many FLOPs does the 8192x8192 MMD distance matrix add per batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1263", "title": "Resolving Conflicting Fairness Metrics in Credit Scoring APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate these constraints and architect a viable deployment strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1264", "title": "Evaluating Adversarial Debiasing Trade-offs in Large-Scale Credit Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you keep adversarial debiasing, or switch to a lighter fairness intervention to hit DI ≥0.80 within 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1267", "title": "Sizing a Non-Blocking Fat-Tree Network", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 64-port switches are needed for a 1:1 non-blocking 2-tier fat-tree for 2,048 GPUs, split between leaf and spine?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 0}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1268", "title": "Diagnosing Bisection Bandwidth Drops in a GPU Cluster", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What routing-level architectural issue in the fat-tree network is causing this specific bottleneck?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 1}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1269", "title": "Non-blocking vs Oversubscribed Fat-Tree", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose a 1:1 or 2:1 oversubscribed fat-tree for 65,536-GPU MoE training, and why?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 2}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1270", "title": "Chaos Testing Distributed Training Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many faults should you inject, and what is the overall cluster availability percentage during this 100-hour test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1271", "title": "Diagnosing Deadlocks During Simulated GPU Fault Injection", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 1024-GPU job hang after the injected Xid 48 failure, and what NCCL logs and settings are needed to test recovery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1272", "title": "Continuous Fault Injection in Synchronous Distributed Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you inject random node failures every 2 hours in the 4,096-GPU production run, or test in staging, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1273", "title": "Cluster Hard Fault MTBF Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected cluster MTBF from hard HBM and transceiver faults for the 10,000-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1274", "title": "Diagnosing Co-tenant Voltage Fault Injections", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What attack explains the co-tenant-correlated misclassifications, and how does it bypass enclave memory isolation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1275", "title": "Evaluating Cloud Enclave Fault Injection Mitigations", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which mitigation should you choose against Plundervolt-style key leakage, and what are the performance, security, and TCO trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1276", "title": "Evaluating Fault Models for Large-Scale GPU Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 100 daily hardware errors all trigger fail-stop restarts, or should you use a multi-tier fault model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1277", "title": "Diagnosing Network Saturation in Sharded Embedding Lookups", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the frontend 100 Gbps NIC saturating, and how should DLRM batching change across the 8 embedding servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1278", "title": "Cloud FPGA Fault Injection Query Cost", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many API requests and dollars would the attacker need to extract the 4,000x1,000 dense layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1279", "title": "Evaluating Feature-Parallel Batching for DLRM", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch to feature-parallel batching or upgrade to 800 Gbps NICs for the 50,000 QPS DLRM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1280", "title": "Diagnosing Limp-Ware Fault Models in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", 
"status": "published", "phase": "training", "question": "What fault model explains the >40% throughput drops, and how can you turn the degraded PCIe link into an orchestrator-handled failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1281", "title": "Optimizing DLRM with Feature-Parallel Batching", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much CUDA launch overhead is saved by switching from request-parallel to feature-parallel batching for 512 requests and 80 features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1282", "title": "Calculating GPU Feeding Tax for Distributed Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the feeding tax when 8 GPUs demand 1.2 GB/s of images but the NAS supplies only 800 MB/s?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 0}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1283", "title": "Diagnosing GPU Starvation in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the GPUs averaging only 40% utilization despite DataLoader tuning, and what I/O change is needed?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 1}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1284", "title": "Calculating HBM Traffic in Flash Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic per attention head is saved by FlashAttention for N=8192, d=128 FP16 attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1285", "title": "Evaluating Storage Architecture to Eliminate CV Training Feeding Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Would you upgrade to 100 Gbps with WekaFS or add local NVMe caching with WebDataset, and why?", "chain_ids": ["cloud-chain-auto-003-19"], "chain_positions": {"cloud-chain-auto-003-19": 2}, "chain_tiers": {"cloud-chain-auto-003-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1286", "title": "Diagnosing SRAM Spills in Tiled Attention", "topic": "flash-attention", 
"competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do the Br=256 and Bc=256 attention tiles spill to HBM on A100, and what tile sizing fixes the bottleneck?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 1}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1287", "title": "Evaluating Flash Attention Arithmetic Intensity", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is standard 32K attention compute-bound on A100, or does Flash Attention improve wall-clock time by raising arithmetic intensity?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 2}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1288", "title": "HBM Bandwidth Savings with FlashAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM traffic do S and P cause for one FP16 head at N=8192, and how much does FlashAttention reduce it?", "chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 0}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1289", "title": "Diagnosing FlashAttention Recomputation Optimization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would caching the NxN attention probabilities to save backward FLOPs degrade throughput despite 20GB free HBM?", "chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 1}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1290", "title": "Evaluating FlashAttention vs Standard Attention for 32K Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use activation checkpointing with standard attention or adopt FlashAttention for 32K context, and why?", "chain_ids": ["cloud-chain-auto-008-12"], "chain_positions": {"cloud-chain-auto-008-12": 2}, "chain_tiers": {"cloud-chain-auto-008-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1291", "title": "Latency Overhead of RS-FEC in PAM4 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", 
"question": "Calculate the total one-way end-to-end latency and the percentage of this latency introduced solely by FEC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1292", "title": "Jacobian Computation for Low-Dimensional Inputs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which automatic differentiation mode should be used to compute the Jacobian, and how many passes will it require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1293", "title": "Diagnosing 400G PAM4 Latency Floors", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you disable FEC to lower 400G RoCEv2 hop latency, and what causes the ~1.2μs latency floor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1294", "title": "Diagnosing Jacobian Bottlenecks in Many-Output Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do reverse-mode VJPs OOM for a 65,536x25 Jacobian, and which autodiff strategy should replace them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1295", "title": "Evaluating Forward-Mode AD for Jacobian Regularization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 500x5 PINN Jacobian be computed with forward-mode AD instead of reverse-mode, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1296", "title": "Distributed LLM Training Deadlock and Gang Scheduling", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many GPU-hours are wasted before NCCL times out, and what scheduling paradigm prevents this partial allocation?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 0}, "chain_tiers": {"cloud-chain-auto-021-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1297", "title": "Evaluating FEC Trade-offs in 800G PAM4 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should FEC be disabled on 2-meter 800G PAM4 DAC links to save 100–150 ns per hop, and why?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1298", "title": "Diagnosing PyTorch DDP Partial Allocation Deadlocks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this resource fragmentation and resolve the cluster-wide deadlock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1299", "title": "Evaluating Gang Scheduling for LLM Training", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use an all-or-nothing gang scheduler or app-level timeouts for these 16-node FSDP jobs, and what is the compute waste impact?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 2}, "chain_tiers": {"cloud-chain-auto-021-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1300", "title": "Diagnosing CPU Inefficiency in Transformer Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 112 CPU cores at 100% utilization achieving under 5% peak TFLOPs for batch-size=1 Transformer inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1301", "title": "Evaluating Accelerator Efficiency Overheads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 10 TFLOP/500W AVX-512 CPU fleet far less power-efficient than the 200 TFLOP/400W accelerator for dense inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1302", "title": "Go-Back-N Congestion Penalty During AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With Go-Back-N and 2,000 unacknowledged 4KB packets, how much data is retransmitted after the single dropped packet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1303", "title": "Quantifying the CPU Generality Tax in Batch Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the pJ/FLOP for the Xeon and T4, and what annual power cost savings come from moving the 10 PFLOP/s workload to T4s at $0.10/kWh and 1.2 PUE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1304", "title": "Diagnosing RoCEv2 Throughput 
Collapse Under Minor Packet Loss", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would such a microscopic packet drop rate cause an 80% degradation in goodput, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1305", "title": "Go-Back-N Penalty in 800G RoCEv2 Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you accept the vendor's Go-Back-N RoCE NICs for 2GB AllReduce at 800Gbps with 0.1% packet loss, and what bandwidth penalty results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1306", "title": "GPU DALI Preprocessing Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If JPEG decoding moves to the GPU with DALI, what are the new per-image latency and PCIe transfer volume savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1307", "title": "CPU-Bound Image Preprocessing Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is bottlenecking the A100 image classification service at 400 RPS, and how should the preprocessing pipeline be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1308", "title": "Evaluate GPU Preprocessing for Video Analytics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the video pipeline send 200KB compressed frames to the GPUs and decode with DALI instead of transferring 6.2MB raw frames, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1309", "title": "SLA Budgets Under Network Stress", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "During the outage, what is the heuristic model's maximum compute budget and how many CPU instances are needed for 10,000 RPS?", "chain_ids": ["cloud-chain-auto-secondary-015-19"], "chain_positions": {"cloud-chain-auto-secondary-015-19": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1311", "title": "Scaling Effective Batch Size via Accumulation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", 
"zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many gradient accumulation steps are required to reach a global batch size of 1024, and how should the loss be scaled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1312", "title": "Diagnosing DDP Overhead in Gradient Accumulation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is DDP utilization low during 32 gradient accumulation steps, and how do you fix the communication overhead?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 1}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1313", "title": "Sparse Gradient Accumulation on 8x A100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the compute, memory, and communication trade-offs of 8-step accumulation with optional sparse gradients?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 2}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1314", "title": "Degrading DLRM Ranking Under Capacity Loss", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you gracefully degrade the recommendation API after losing 60% of T4 capacity while keeping P99 under 200ms?", "chain_ids": ["cloud-chain-auto-secondary-015-19"], "chain_positions": {"cloud-chain-auto-secondary-015-19": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1315", "title": "Gradient Sparsification Compute Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did Top-1% gradient sparsification make the 10B model's step time over 2 seconds slower despite reducing payload to 200MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1316", "title": "Mitigating Gradient Inversion in Medical Federated Learning", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do batch-size-1 federated gradients allow exact X-ray reconstruction, and what mitigation prevents it with under 1% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-1317", "title": "Gradient Quantization vs Sparsification", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 10B model on 64 GPUs, should you deploy Top-1% sparsification or INT8 gradient quantization, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1318", "title": "Analytical Gradient Inversion Complexity", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For batch size 1, how many FLOPs are needed to reconstruct the 4,096-dimensional input from dW and db?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1321", "title": "Ring All-Reduce Data Volume Calculation", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much gradient data does each GPU transmit per Ring AllReduce step for the 30B BF16 model on 64 GPUs, and how long does it take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1322", "title": "Local Gradient Clipping Divergence", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does clipping gradients locally before AllReduce make the 13B DDP run diverge, and what is the correct ordering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1323", "title": "Evaluating Sync vs Async Gradient Strategies", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you switch the 7B LLM training job to Async Parameter Servers or keep Synchronous AllReduce with 8-bit compression, and why?", "chain_ids": ["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 1}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1325", "title": "Diagnosing torch.compile Graph Breaks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did adding a data-dependent if statement to the torch.compile Transformer collapse throughput from 3,500 to 800 tokens/sec, and how should you fix it?", "chain_ids": ["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 1}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-1326", "title": "Evaluating MoE Routing Graph Breaks in PyTorch 2.x", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you pad MoE expert capacities to static sizes or retain dynamic routing to achieve predictable sub-30ms latency with torch.compile?", "chain_ids": ["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 2}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1329", "title": "Estimating Graph Fusion Bandwidth Savings", "topic": "graph-compilation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic and latency does compiler fusion save per attention block?", "chain_ids": ["cloud-chain-auto-023-01"], "chain_positions": {"cloud-chain-auto-023-01": 1}, "chain_tiers": {"cloud-chain-auto-023-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1330", "title": "Diagnosing Frequent Recompilations in Dynamic Transformers", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze and resolve the root cause of these latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1331", "title": "Calculating Latency Overhead of PyTorch Graph Breaks", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 3 graph breaks adding 3ms each, what is the new batch latency and how does it compare to the 20ms eager baseline?", "chain_ids": ["cloud-chain-auto-023-05"], "chain_positions": {"cloud-chain-auto-023-05": 0}, "chain_tiers": {"cloud-chain-auto-023-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1333", "title": "Diagnosing Data-Dependent Control Flow Failures in JIT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did tracing make the seq2seq model always execute 32 decoding steps, and how does Graph Scripting fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1334", "title": "Evaluating Graph Scripting vs Tracing for Dynamic Control Flow", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Adaptive Compute Transformer use trace-based JIT or graph scripting for data-dependent refinement loops, and why?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1335", "title": "Dynamic Loop Compilation Latency", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the average latency difference between the traced max-length model and a scripted model that preserves early exits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1336", "title": "Graph Tracing Control Flow OOM", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What happens when the traced Transformer compiled at seq_len 128 receives seq_len 1024, and how much VRAM does it require?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 0}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1337", "title": "Evaluating Graph Tracing Failures in Dynamic Routing Models", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did graph tracing remove the MoE safety fallback in production, and what compilation strategy should replace it?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 2}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1338", "title": "Silent Failures in Traced Dynamic Control Flow", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do sequence length 256 requests silently fail and execute rapidly after tracing with a [16, 64] dummy input, and how is it fixed?", "chain_ids": ["cloud-chain-auto-023-06"], "chain_positions": {"cloud-chain-auto-023-06": 1}, "chain_tiers": {"cloud-chain-auto-023-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1339", "title": "Group DRO Weight Update Calculation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using exponentiated Group DRO weights with eta_q=1.0, what is the new normalized weight for Group 3?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 0}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1340", "title": "Debugging Group DRO Training Instability", "topic": "graph-compilation", "competency_area": "optimization", "track": 
"cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this instability?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 1}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1341", "title": "Graph Compilation for Variable X-ray Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can per-request graph compilation meet the 200ms SLA, and what serving changes make it viable?", "chain_ids": ["cloud-chain-auto-023-02"], "chain_positions": {"cloud-chain-auto-023-02": 2}, "chain_tiers": {"cloud-chain-auto-023-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1343", "title": "Diagnosing GQA Memory Exhaustion", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the model OOM during the forward pass despite the KV cache fitting in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1344", "title": "Diagnosing High Latency in gRPC Tensor Transfers", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does gRPC/Protobuf still take 8ms for the 2MB tensor transfer, and how would you reduce it toward the network bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1348", "title": "Evaluating gRPC Migration for Inter-Service Tensor Transfer", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will standard gRPC achieve the SLA, and what architectural adjustments might be required for optimal tensor transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1349", "title": "Diagnosing LLM Latency Spikes with KV Cache Offloading", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing TPOT to jump above 800ms with low GPU utilization and saturated PCIe traffic?", "chain_ids": ["cloud-chain-auto-012-06"], "chain_positions": {"cloud-chain-auto-012-06": 1}, "chain_tiers": {"cloud-chain-auto-012-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1351", "title": "Profiling Microsecond Kernel Power Draw", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": 
"fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the true average power consumption of the 800 µs kernel, and how much higher is it than the 400 W NVML estimate?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 0}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1352", "title": "Diagnosing Power Discrepancies in Sub-second Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 100ms nvidia-smi samples report 300W while the PDU shows much higher node power for 15ms inference bursts?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 1}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1353", "title": "Evaluating Discrepancies in GPU Power Measurement Techniques", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which measurement should you trust for microsecond GPU energy, and what should drive power-aware scheduling for the 8-H100 cluster?", "chain_ids": ["cloud-chain-auto-015-04"], "chain_positions": {"cloud-chain-auto-015-04": 2}, "chain_tiers": {"cloud-chain-auto-015-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1355", "title": "Triple Modular Redundancy Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the probability that the 3-GPU TMR system produces an incorrect majority output in a 30-day window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1357", "title": "Evaluating Dynamic Hardware Precision Scaling", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you rely on the accelerator's dynamic precision unit or use static mixed-precision quantization for 70B LLM serving, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1358", "title": "Tensor Core Vocabulary Padding", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this severe underutilization, and how should the system be redesigned to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1359", "title": "Evaluating GPU Redundancy 
Trade-offs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use DMR or TMR for the 500-GPU medical inference service, given a 100ms SLA and 40ms inference time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1360", "title": "Tensor Core Dimension Alignment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy Proposal B with 1000/30000 dimensions, and how do Tensor Core alignment constraints affect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1361", "title": "Hardware Trojan Power Side-Channel Detection", "topic": "extreme-quantization", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many traces must you average so the 3-sigma noise bound is no more than the 5 mW Trojan signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1362", "title": "FlashAttention SRAM Tiling Calculation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum symmetric block size B fits Q, K, V, and O in the 128 KB SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1363", "title": "Hardware Trojan Evidence in AI Accelerator Validation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is the rare-token logit corruption more consistent with a hardware Trojan or benign faults, and what evidence supports that?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1364", "title": "Evaluating Hardware Trojan Mitigations", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which mitigation—logic locking, split manufacturing, or boot-time fingerprinting—best fits a 50,000-GPU synchronous LLM training fleet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1365", "title": "Diagnosing Low SM Utilization in LLM Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the GPU at <5% SM utilization for batch-1 decoding, and why won't a cuBLAS upgrade materially improve tokens/sec?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1367", "title": "Hedged Requests Tail Math", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much extra QPS do P90 hedged requests add, and what is the new probability a request exceeds 65ms given single request P(>65ms) = 0.5% and P(>50ms) = 1%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1368", "title": "Debugging a Hedged Request Retry Storm", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did static 15ms hedging crash the embedding store at 15,000 QPS, and what structural fix prevents the cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1369", "title": "HBM3 vs HBM2e for LLM Serving Infrastructure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the two nodes trade off TTFT, decode throughput, and synchronization for low-latency 70B serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1372", "title": "Evaluating Hedged Request Thresholds for Feature Store", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you determine the optimal delay threshold for dispatching the secondary request, and what is the quantitative impact on the storage backend cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1373", "title": "Edge-Cloud Hierarchical Bandwidth Filtering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much data does each robot upload per day with 12 ten-second safety events per hour, and how much does that save versus continuous streaming?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 0}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1375", "title": "Incremental Allocation Deadlock Math", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many GPUs does each of the four jobs hold at deadlock, and how many GPUs must be preempted to break it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1376", "title": "Three-Tier Retail Video Analytics", "topic": 
"transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you partition compute across the Cortex-M4 cameras, Jetson edge server, and cloud to meet all network constraints while maximizing camera battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1377", "title": "Cascading Filter Failure in Hierarchical Pipelines", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does latency spike to 8 seconds during shift changes while cloud GPU utilization drops below 40%?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 1}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1378", "title": "Diagnosing GPU Allocation Deadlock from Greedy Incremental Scheduling", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are all 512 GPUs allocated but idle, and what scheduler change would prevent this deadlock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1379", "title": "Evaluating Deadlock Resolution in Fleet Orchestration", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you re-enable strict gang scheduling or use timeout-based preemption to maximize goodput on the 1,024-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1380", "title": "Diagnosing Memory and Latency Blowup in FHE Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FHE CNN take 45 minutes and 120GB per image, and what model changes could make it practical?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1381", "title": "End-to-End Latency of HE Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total upload-plus-compute latency for one CKKS-encrypted inference request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1382", "title": "Evaluating FHE for Cloud-Based CNN Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "Can a CKKS FHE architecture meet the 10-second ResNet-50 encrypted X-ray SLA, and what alternative would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1383", "title": "HPCC Rate Adjustment with INT", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the flow's new transmission rate using HPCC to fully utilize the link while draining the queue in exactly one RTT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1384", "title": "Diagnosing Host DRAM Starvation in Vision DataLoader Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the host DRAM staging strategy failing, and how should the data pipeline be structurally analyzed and fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1385", "title": "Sizing Host DRAM for Decoded Tensor Staging", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much host DRAM is needed for four decoded global batches, and what sustained PCIe bandwidth is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1386", "title": "Evaluating Host DRAM Staging vs GPU DALI Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you quantitatively evaluate whether to scale up Host DRAM/CPUs or bypass Host DRAM to decode/augment directly on GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1387", "title": "Evaluating HPCC vs DCQCN in 400GbE AI Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy HPCC with INT or tune DCQCN for the 4096-GPU RoCEv2 cluster, and what are the main trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1388", "title": "Impact of I/O Jitter on Distributed Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why do 1024 gated shard reads make step I/O tail-dominated, and how should you hide the wait?", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 0}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1389", "title": "Diagnosing Distributed I/O Jitter in Synchronous Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you quantify synchronous I/O jitter across 256 GPUs, and how would you mitigate it without over-provisioning storage?", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 1}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1390", "title": "Mitigating Storage I/O Jitter in Checkpointing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade the PFS or implement two-stage async checkpointing for the 2.1TB state, and why?", "chain_ids": ["cloud-chain-auto-003-18"], "chain_positions": {"cloud-chain-auto-003-18": 2}, "chain_tiers": {"cloud-chain-auto-003-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1391", "title": "Diagnosing the I/O Wall in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage bandwidth does the 8-GPU ViT training loop require, and will a 2GB/s NAS cause an I/O wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1392", "title": "Mitigating the Multimodal I/O Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you upgrade to a Parallel File System (Lustre) or implement local NVMe WebDataset caching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1393", "title": "Medical 3D CNN I/O Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the low 45% GPU utilization on the 8x H100 node, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1394", "title": "Root-Causing HPCC Incast Drops in 400G Networks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is HPCC failing to prevent spine buffer drops during the 256-node All-to-All incast, and how should you tune it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1395", "title": "Debugging Intermittent Duplicate Features in Spot-Backed Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause and resolve this issue without disabling Spot instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1396", "title": "Evaluating Idempotency in LLM Data Pipelines", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which resilience design should you choose for the 50 TB/day PII redaction pipeline, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1397", "title": "Evaluating im2col Memory Inflation in High-Res CNNs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For this first 3x3 convolution on 2048x2048 FP16 images, should you use im2col+GEMM, Winograd, or implicit GEMM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1398", "title": "Diagnosing Hidden Memory Costs in Convolution", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What transformation causes the OOM in this convolution layer, and how should it be implemented instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1399", "title": "im2col Memory Expansion Overhead", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What im2col intermediate memory footprint does this FP8 3x3 convolution create, and why can it OOM a 16GB T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1400", "title": "Immutable Audit Trail Storage Scaling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you design the audit logging path, and what write throughput and 7-year WORM storage are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1401", "title": "Calculating Storage Bloat from Non-Idempotent Appends", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much 30-day storage bloat and deduplication shuffle does non-idempotent append create, and what write strategy avoids it?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1402", "title": "Evaluating High-Throughput Immutable Audit Trails", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which audit architecture meets the <50ms inference SLA and 7-year WORM retention at 50,000 requests per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1403", "title": "Imperative Scheduling Topology Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 64-GPU job stay Pending even though 100 GPUs are idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1404", "title": "Resolving Imperative Scheduling Deadlocks on GPU Nodes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the schedulable capacity of each p4d node, and why does the strict pod request remain Pending?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1405", "title": "Parameter Server Incast Buffer Overflow Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What happens at the ToR switch when all 1,024 workers simultaneously send 2MB gradients to one parameter server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1406", "title": "Feature Store Audit Trail Bottleneck", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the P99 latency spike after synchronous PostgreSQL audit logging, and what compliant architecture avoids it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1407", "title": "Diagnosing Network Incast in Distributed Training Synchronization", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What network pattern is causing All-Reduce tail latency despite only 30% average link utilization, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1408", "title": "Evaluating Incast Mitigation in Large-Scale GPU Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "Should you rely on PFC and deep buffers or restructure AllReduce communication to mitigate the 31-to-1 incast, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1409", "title": "CNN vs MLP Parameter Scaling", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the first-layer parameter counts of the MLP and CNN compare, and why does the CNN's inductive bias help on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1410", "title": "Analyzing ViT Generalization in Data-Constrained Cloud Environments", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the ViT-Base overfitting the 50,000 X-ray dataset while ResNet-50 generalized better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1411", "title": "Evaluating Imperative vs Declarative Scheduling for Mixed Workloads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose between Slurm-style imperative scheduling and declarative gang scheduling to improve 65% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1412", "title": "Evaluating Inductive Bias in Data-Constrained Satellite Imagery", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the satellite anomaly pipeline use the 2B-parameter ViT or the 50M-parameter CNN given only 200,000 labels, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1413", "title": "Dynamic Batching Latency Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum safe max_batch_delay you can set with a batch size of 32 to maximize throughput without violating the 50ms P99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1414", "title": "Debugging End-to-End Visual Search Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P90 API latency spike to 120ms at 150 QPS despite 8ms TensorRT inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1415", "title": "Multi-Stage Inference Pipeline Bottleneck Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum FPS for synchronous versus asynchronous execution, and how many preprocessing threads are required to saturate the T4 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1416", "title": "Calculating Influence Functions for Model Debugging", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is exact inverse Hessian computation feasible for the 50M-parameter ResNet, and what approximation should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1417", "title": "Continuous vs Dynamic Batching Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should this LLM server use request-level dynamic batching or iteration-level continuous batching to meet TTFT and TBT SLOs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1418", "title": "Redesigning an End-to-End Vision Inference Pipeline", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can you redesign this pipeline to meet the 100ms SLA, and what are the quantitative trade-offs regarding batching and GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1419", "title": "Evaluating Influence Functions for ViT Attribution", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is iHVP feasible for a 300M-parameter ViT-L/16, and what approximations make influence functions tractable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1420", "title": "Diagnosing Dynamic Batching Latency Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P99 latency hit 450ms with max_batch_size=128 and max_queue_delay=50ms despite only 55% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1421", "title": "Diagnosing Influence Function OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "Why does the RoBERTa influence-function job OOM before one training-data evaluation step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1422", "title": "Diagnosing Input Stationary Bottlenecks in MLP Workloads", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Input Stationary dataflow bottleneck MLP layers, and how do you quantify the architectural mismatch?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 2}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1423", "title": "Input Stationary Dataflow Memory Math", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many DRAM reads do Input Stationary and Weight Stationary dataflows require for X and W on this 128x128 PE array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1424", "title": "Architectural Trade-offs in Data Integration", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator option better minimizes TCO and tail latency for DLRM inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1425", "title": "Diagnosing HBM-to-SRAM Integration Bottlenecks in Attention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the HBM-SRAM integration bottleneck in standard attention on H100, and what optimization keeps Tensor Cores fed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1426", "title": "Quantifying Batch-1 LLM Weight Movement Cost", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For batch-1 decoding of the 7B FP16 model, what are the time and energy ratios of weight movement versus compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1428", "title": "Team Draft Interleaving Attribution Bias in RecSys", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the discrepancy between the A/B test and the interleaving results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1429", "title": "Mitigating P99 Jitter with Interrupt Shielding", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do NIC interrupts affect CUDA kernel submission latency at 50,000 RPS, and what CPU/IRQ configuration shields inference cores?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 0}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1430", "title": "Diagnosing P99 Inference Jitter from NIC Interrupts", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you root-cause and resolve this latency jitter?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 1}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1431", "title": "Interrupt Shielding vs NIC Coalescing for P99 Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use strict CPU core isolation or NIC interrupt coalescing to stabilize P99 latency under the 20ms SLO, and why?", "chain_ids": ["cloud-chain-auto-011-03"], "chain_positions": {"cloud-chain-auto-011-03": 2}, "chain_tiers": {"cloud-chain-auto-011-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1433", "title": "Diagnosing Hidden Bias in Cloud KYC Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do single-axis FRR metrics mask failures for darker-skinned females, and what maximum FRR could that subgroup have?", "chain_ids": ["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 1}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1434", "title": "Evaluating Interleaving vs A/B Testing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you use interleaving or A/B testing for Model B under a strict 150ms P99 SLA, and how would you manage the tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1435", "title": "Invariance Testing Compute Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What daily 
compute time does the 10-permutation invariance suite add, and what is the application-level violation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1436", "title": "Root-Causing Variance in Resume Screening", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do name swaps and redaction still change the resume model's score, and how would you analyze and fix the pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1437", "title": "Intersectional Bias Mitigation in Identity APIs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What remediation would you choose for the 12.5% FRR intersectional gap given a 3-month data delay and 280ms MoE latency?", "chain_ids": ["cloud-chain-auto-003-16"], "chain_positions": {"cloud-chain-auto-003-16": 2}, "chain_tiers": {"cloud-chain-auto-003-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1438", "title": "Evaluating Invariance Testing Architectures at Scale", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use dynamic LLM counterfactuals or a static precomputed perturbation dataset for the 100,000-applicant CI/CD tests, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1439", "title": "Calculating IO Overhead in Standard Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM I/O time does standard attention spend materializing S and P, and what does FlashAttention-2 save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1440", "title": "Evaluating IO-Aware Attention on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will FlashAttention-2 speed up training at sequence length 4096 despite 15% more FLOPs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1441", "title": "Diagnosing Activation Function Memory Bottlenecks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP16 activation take ~500 µs on an A100 despite using under 1 GFLOP, and how should it be optimized?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1442", 
"title": "Applying the Iron Law of ML Systems", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which investment, A, B, or C, gives the largest absolute end-to-end latency reduction under the Iron Law?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1443", "title": "LLM Inference Bottleneck Analysis", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why will pruning 50% of linear-layer FLOPs fail to significantly reduce batch-1 token latency for the 7B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1444", "title": "Calculating Single-Token Latency on A100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected single-token latency, and what is the primary Iron Law bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1445", "title": "Evaluating Topologies via the Iron Law", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which cluster option is more cost-effective for the 70B training run, and what MFU would Option B need to break even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1446", "title": "Diagnosing Low MFU Despite High GPU Utilization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What does the low MFU reveal despite 95% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1447", "title": "Shift-Left Constraint Validation Savings", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many A100 GPU-hours does a 0.5-hour shift-left validation step save per successfully deployed model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1448", "title": "Diagnosing High Iteration Tax in RTB Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the staging failure iteration tax, and what structural pipeline change would prevent 15ms latency failures after training?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1449", "title": "Shift-Left KV-Cache Sizing for LLM Serving", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What wasted and added training cost does the late KV-cache discovery cause, and what workflow prevents it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1450", "title": "Calculating JSD for Disjoint Distributions", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does KL divergence crash for P=[1,0] and Q=[0,1], and what is the exact base-2 Jensen-Shannon Divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1451", "title": "Diagnosing Infinite Drift Alerts in KL Divergence", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this failure mode and architect a more robust drift detection thresholding mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1452", "title": "Drift Detection Thresholds in High-Throughput Recommendation Systems", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should automated retraining use KL divergence or JS divergence for hard drift thresholds on 256-dimensional embeddings, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1453", "title": "JIT Recompilation Spikes in Dynamic Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause and resolve these P99 latency spikes associated with JIT compilation?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 2}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1454", "title": "Quantifying Element-wise Kernel Fusion Speedup", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical speedup would a fused custom CUDA kernel achieve for this 1GB FP16 activation layer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1455", "title": "Calculating JIT Kernel Fusion Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After JIT warmup, what is the new inference latency and total latency reduction from eliminating dispatch and fusing kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1456", "title": "Root-Causing Memory-Bound Activations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this bottleneck, and what is the structural cause within the framework?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1457", "title": "Mitigating Dispatch Bottlenecks with CUDA Graphs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total latency for 400 small MLP kernels with 5 µs per-kernel CPU launch overhead, and how do CUDA Graphs remove it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1458", "title": "Evaluating JIT Compilation for Dynamic Shapes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you use JIT compilation for 10-to-512-token dynamic queries to meet the 5ms P99 SLA without 500ms recompilation spikes?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 3}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1459", "title": "Evaluating Fusion Strategies for Custom Layers", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which option would you choose for the 50MB memory-bound operation chain, and how do memory traffic, latency, and engineering cost compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1460", "title": "Diagnosing CPU Bottlenecks in Batch-1 Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch-size-1 serving miss the 30ms SLA despite high batch-32 utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1461", "title": "Calculating KL Divergence for Feature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is D_KL(P||Q) for the age-group distributions, 
and should the severity-1 alert fire?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 0}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1462", "title": "Diagnosing Silent Accuracy Drops using KL Divergence", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What does the spike in KL(Production||Training) to 1.2 nats imply, and how would you diagnose the CTR drop?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 1}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1463", "title": "Memory Overhead of Teacher Logits in LLM Distillation", "topic": "knowledge-distillation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much GPU memory is needed to materialize the teacher's FP32 logits for one 16×2048 micro-batch over a 100,000-token vocabulary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1464", "title": "Evaluating KL Divergence for High-Throughput Drift Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is pure KL divergence robust enough for the 864M-sample daily drift trigger, and how would you handle unseen production bins?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 2}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1465", "title": "Debugging Online KD Bottlenecks", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this inefficiency, and how should the distillation system be redesigned?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1466", "title": "Evaluating Distillation Trade-offs for Cloud LLM Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 70B-to-7B summarizer use logit distillation, intermediate-state distillation, or both, given the capacity gap and temperature trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-015-32"], "chain_positions": {"cloud-chain-auto-secondary-015-32": 1}, 
"chain_tiers": {"cloud-chain-auto-secondary-015-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1467", "title": "CUDA Graphs vs Fusion for Inference", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would CUDA Graphs or a custom fused Triton kernel better address the dynamic batch-1 attention bottleneck, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1468", "title": "Evaluating KV Cache Memory Constraints and PagedAttention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can static KV-cache allocation support batch 128 at 2048 tokens on 4×80GB A100s, and what serving architecture should replace it?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 4}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1469", "title": "Calculate Maximum Batch Size for LLM KV Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum concurrent batch size can the 80 GiB GPU serve before KV-cache OOM at 2048 tokens?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 2}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1470", "title": "Llama-2-70B Serving OOM from Static KV-Cache Allocation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What exact KV-cache memory bottleneck causes OOM for 128 concurrent 4096-token Llama-2-70B requests on 320GB VRAM?", "chain_ids": ["cloud-chain-auto-008-14"], "chain_positions": {"cloud-chain-auto-008-14": 3}, "chain_tiers": {"cloud-chain-auto-008-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1471", "title": "KV Cache Checkpoint Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bandwidth is required to checkpoint 50 full 16,000-token KV caches every 5 seconds, and is it viable on 100Gbps NICs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1472", "title": "Diagnosing Network Bottlenecks in KV Cache Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "training", "question": "Why do full-KV checkpoints cause 30-second failover delays and 100% network egress at 64K-token contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1473", "title": "Stateful Serving Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which checkpointing strategy and frequency best meet the 2.0s RTO for 64K-token Llama-3-70B sessions over 400Gbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1474", "title": "Calculating INT8 KV Cache Memory Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FP16 KV cache memory for 64 requests at 4096 tokens, and will INT8 KV cache quantization fit the 7B model on one 80GB GPU?", "chain_ids": ["cloud-chain-auto-008-15"], "chain_positions": {"cloud-chain-auto-008-15": 0}, "chain_tiers": {"cloud-chain-auto-008-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1475", "title": "Evaluating KV Cache Quantization for Long-Context RAG", "topic": "compound-ai-systems", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much KV-cache memory does FP16 vs INT8 require at batch 64 and 32K context, and what strategy preserves long-context retrieval accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1476", "title": "Quantifying Label Quality Drift in Moderation Pipelines", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the additional monthly cost caused by label drift, and what should you implement to detect and prevent noisy labels from poisoning retraining?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1477", "title": "Diagnosing Output Degradation in INT8 KV Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure?", "chain_ids": ["cloud-chain-auto-008-15"], "chain_positions": {"cloud-chain-auto-008-15": 1}, "chain_tiers": {"cloud-chain-auto-008-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1478", "title": "Mitigating Annotation Degradation in Moderation Pipelines", "topic": 
"data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which labeling strategy or hybrid should you choose under the $10,000/day budget: (1) k=3 consensus, (2) 5% in-house sample, or (3) Confident Learning?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1479", "title": "Diagnosing Label Shift in E-Commerce Content Moderation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What label-shift correction should you apply when the spam prior jumps from 5% to 40% while P(Y|X) stays accurate?", "chain_ids": ["cloud-chain-auto-003-09"], "chain_positions": {"cloud-chain-auto-003-09": 1}, "chain_tiers": {"cloud-chain-auto-003-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1480", "title": "Real-Time Fraud Label Shift Adaptation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the estimated true fraud rate when 27.5% of transactions are flagged, and how should you adjust the model outputs?", "chain_ids": ["cloud-chain-auto-003-09"], "chain_positions": {"cloud-chain-auto-003-09": 0}, "chain_tiers": {"cloud-chain-auto-003-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1481", "title": "Diagnosing Annotation Degradation in Data Pipelines", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically root-cause the F1 drop, and which metrics would prove label quality drift?", "chain_ids": ["cloud-chain-auto-secondary-015-25"], "chain_positions": {"cloud-chain-auto-secondary-015-25": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1482", "title": "Evaluating Mitigation Strategies for Acute Fraud Label Shift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use full retraining, sample weighting, or post-hoc calibration for the fraud-rate jump, and why?", "chain_ids": ["cloud-chain-auto-003-09"], "chain_positions": {"cloud-chain-auto-003-09": 2}, "chain_tiers": {"cloud-chain-auto-003-09": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1483", "title": "Diagnosing P99 Latency Spikes in Triton", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 200ms+ P99 spikes despite 45% GPU utilization, and how would you fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1484", "title": "Calculating Maximum Allowable Latency Jitter for RTB", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum allowable jitter under the 40ms SLA, and how much must the current 45ms jitter be reduced?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1485", "title": "Mitigating Tail Latency Jitter in Real-time LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is more cost-effective for cutting OS-induced P99 jitter below 250ms: CPU pinning and NUMA isolation or DPDK/RDMA, and how would you justify this quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1486", "title": "Layer Normalization Memory Cost", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the naive LayerNorm memory traffic and theoretical time, and how do they compare with a fused implementation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1487", "title": "Evaluating Compute vs. 
Network Hardware Upgrades", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you upgrade the 512-GPU cluster to H100s while keeping 200 Gbps InfiniBand, and what does distributed efficiency predict?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1488", "title": "Diagnosing Distributed Training Scaling Failure", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the 5x compute upgrade to 8x A100 nodes increase 30B training throughput by only 38%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1489", "title": "Applying the Law of Distributed Efficiency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Ring All-Reduce communication time, and how does halving compute time affect the communication-to-computation ratio and speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1490", "title": "Diagnosing LLM Data Exfiltration Across Layers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TLS, IAM, and a WAF fail to stop patient-record extraction, and what ML-specific defenses are missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1491", "title": "Evaluating Normalization at Micro-Batch Scale", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does BatchNorm break down with micro-batch size 2 across 128 GPUs, and what normalization should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1492", "title": "Latency Overhead in Layered Defense", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the new end-to-end latency with TEE, input sanitization, and DP noise, and does it meet the 125ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1493", "title": "LLM Layered Defense Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 8B guardrail run on the H100s or an L4 node, considering TTFT and H100 KV-cache utilization?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1494", "title": "Root-Causing Throughput Collapse via Exposed Trace APIs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing throughput to collapse from 4,000 req/sec to 150 req/sec, and how should the Triton ingress be secured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1495", "title": "Exploiting Debug Headers in Shared LLM Caches", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What information leak do the debug headers and cross-tenant prefix cache create, and how should you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1496", "title": "Triton Debug Port Exposure Risks", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the security risks of exposing Triton metrics and profiler ports to 10,000 engineers, and how should observability be secured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1497", "title": "Fused Residual Add and LayerNorm for HBM Bandwidth", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you analyze this bottleneck and what implementation resolves it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1498", "title": "Mitigating the Learnability Gap in Massive MLPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the team double the MLP to 1B parameters, or use structural inductive biases to close the learnability gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1499", "title": "Evaluating the Learnability Gap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why would a 50B flat MLP fail on satellite images despite higher capacity, and why choose the 5B ViT instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1501", "title": "Trans-Pacific ML Fraud Detection Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", 
"question": "Can a centralized Virginia fraud service meet a 50ms SLA for users in Sydney when model inference takes 10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1502", "title": "Hardware Refresh Carbon ROI", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the hydro-powered A100 cluster be replaced with 128 H100s now, considering 2-year lifecycle carbon accounting?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 2}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1503", "title": "Hardware Upgrade Carbon Breakeven", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which option minimizes total carbon over 3 years: 1,000 new H100s or 2,500 repurposed V100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1504", "title": "Datacenter Carbon Lifecycle Assessment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total lifecycle carbon per node in Region B, and what percentage comes from embodied carbon?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1505", "title": "Why TensorRT Cannot Fix Tokyo to Virginia Latency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can TensorRT optimization make Tokyo-to-us-east-1 inference meet the 50ms SLA, or what architectural change is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1506", "title": "LLM Fleet Capacity via Little's Law", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum total KV cache memory required to sustain 2,000 RPS at 1.5s latency with 50MB per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1507", "title": "Diagnosing Throughput Collapse in LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using Little's Law, what concurrency is the 8xH100 LLM service hitting at 30 RPS and 5s latency, and what bottleneck explains the plateau?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1508", "title": "Global Real-Time Inference and the Light Barrier", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a centralized us-east-1 fraud service meet a 100ms P99 SLA for Tokyo and Sydney traffic with 20ms H100 inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1509", "title": "L4 vs L7 Load Balancing for gRPC Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does P99 latency spike above 100ms at 75% utilization under L4 round-robin, and what load balancing strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1510", "title": "Evaluating Cluster Concurrency Limits", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a 200-request global concurrency cap meet 500 RPS at 2.5s average latency, and what concurrency should be provisioned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1511", "title": "LLM Fleet Tail Latency Debugging", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 latency hit 9s at 40% average GPU utilization, and what L7 routing policy should replace round-robin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1512", "title": "L7 Load Balancing for Speech AI", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimum ingress bandwidth is needed for 2,000 audio streams at 250 KB/s, and what routing algorithm minimizes P99 latency?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1513", "title": "Diagnosing P99 Spikes in LLM Fleets", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 TTFT exceed 3s at only 60% GPU utilization under round-robin, and what routing signal should the load balancer use?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1514", "title": "Disaggregated Load Balancing for LLMs", "topic": "load-balancing", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you migrate to disaggregated prefill/decode serving or overprovision the cluster, and why?", "chain_ids": ["cloud-chain-auto-secondary-013-30"], "chain_positions": {"cloud-chain-auto-secondary-013-30": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1515", "title": "Training Time Estimation with Local NVMe Caching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long does 10-epoch training take with a local NVMe warm cache versus reading the 64TB dataset from S3 every epoch?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 0}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1516", "title": "Debugging Remote Storage Bottlenecks in Multi-Node Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is bottlenecking the job, and how can local NVMe caching raise GPU utilization above 80% after epoch 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1517", "title": "Local NVMe Caching for Multi-Epoch Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use node-local NVMe warm caching, a Redis-style distributed cache, or Lustre for the 15TB S3 vision dataset, and why?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 2}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1518", "title": "Diagnosing Low NVMe Cache Hits in Batch Inference", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are NVMe cache hit rates below 5% in the daily Kubernetes jobs, and how should scheduling be changed?", "chain_ids": ["cloud-chain-auto-003-11"], "chain_positions": {"cloud-chain-auto-003-11": 1}, "chain_tiers": {"cloud-chain-auto-003-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1519", "title": "Strict vs Relaxed Locality-Aware Scheduling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", 
"question": "Should the cluster use strict cache-local scheduling or relaxed scheduling with remote cache reads, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1520", "title": "Delay Scheduling Break-even Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum wait time T (in seconds) the scheduler should tolerate before falling back to scheduling the job on an idle node and pulling the dataset over the network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1521", "title": "Sizing an M/G/c/K LLM Inference Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the server utilization p and average end-to-end latency for the service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1522", "title": "M/G/c/K Queue Sizing for LLM Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the M/G/c/K 70B API gateway, should K be set to 16 or 64 to balance drops against P99 wait time, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1523", "title": "Analyzing Catastrophic Forgetting in LLM Unlearning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What additional objective should the unlearning loss include to preserve MMLU while removing the 5GB corpus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1524", "title": "Analyzing P99 Latency in M/G/c/K Queues", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do p99 latency and drops violate the 2-second SLA despite 75% utilization in the LLM inference queue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1525", "title": "Evaluating LLM Unlearning Strategies", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which unlearning approach should you use for the 100M-token GDPR deletion on the 7B LLM, and how would you validate compliance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1526", "title": "SISA Exact Unlearning Cost", "topic": "data-pipeline-engineering", 
"competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the minimum GPU-hour cost to exactly unlearn records A, B, and C from the SISA ensemble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1527", "title": "Evaluating GPU Instances for High-Batch Dense FFNs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which instance do you select to maximize throughput, and how do you justify this via the roofline model?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 4}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1529", "title": "Diagnosing Attention vs FFN Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the MHA batched GEMMs show lower SM utilization than FFN layers at sequence length 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1530", "title": "Analyzing FFN Layer Bottlenecks on A100 GPUs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FFN's arithmetic intensity during batch-1 autoregressive generation, and what execution time should the A100 achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1531", "title": "Evaluating GEMM Tiling Strategies on A100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the proposed 256x256 FP16 GEMM tile viable, and what tile size should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1532", "title": "Diagnosing MMD Monitoring Bottlenecks at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does exact RBF-kernel MMD OOM and time out for the 1M-reference, 18M-live drift job, and what scalable method should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1533", "title": "MMD Drift Detection Compute Constraint", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much compute does naive quadratic MMD need for N=M=20,000, does 
it fit 100 GFLOPs, and what N=M stays within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1534", "title": "Evaluating MMD Drift Detection Scale", "topic": "compound-ai-systems", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can exact nightly MMD between 100,000 reference embeddings and 10^8 daily queries finish on one A100 in 1 hour, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1535", "title": "Diagnosing GEMM Tensor Core Underutilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the M=1023 GEMM achieve 45 TFLOPs while the M=1024 GEMM exceeds 200 TFLOPs in FP16 PyTorch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1536", "title": "Mitigating MIA in Clinical APIs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which MIA mitigation should you choose for the 15B clinical model—DP-SGD, Top-1 truncation, or rate limiting—and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1537", "title": "Evaluating Membership Inference Attack Advantage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the MIA TPR, FPR, and Attack Advantage, and what do they imply for leakage and mitigation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1538", "title": "Attention Matrix Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the FP16 memory footprint of the B=16, H=16, S=4096 attention score matrix, and will it fit in the 40MB L2 cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1539", "title": "Diagnosing Sequence Length OOM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does increasing sequence length to 8192 cause forward-pass OOM under ZeRO-1, and what attention implementation fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1540", "title": "Activation Checkpointing vs CPU Offloading", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you bridge the 40GB gap using Activation Checkpointing or CPU Offloading over PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1541", "title": "Evaluating L40S vs H100 for LLM Decoding", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For batch-1 decoding of a 70B FP16 chatbot, should you choose 8x L40S or 4x H100 SXM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1542", "title": "Diagnosing Low Compute Utilization in Autoregressive Generation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this bottleneck and what should be the actual optimization strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1543", "title": "13B FP16 LLM Batch-1 Token Rate from Memory Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical batch-1 autoregressive generation rate for the 13B FP16 model?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 1}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1544", "title": "Calculate Autoregressive Token Generation Throughput", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the maximum theoretical token rates for the 7B FP16 model on the A10G at batch size 1 and batch size 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1545", "title": "Batch-1 Llama-2 70B A100-to-H100 Decode Speedup", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will H100s give the expected 3.1x speedup for batch-1 Llama-2 70B decoding, and what speedup should leadership expect?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 3}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1546", "title": "70B Decode Throughput: Continuous Batching vs INT8 Matmul", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 70B 
decoding on 8x A100s with a 50ms TBT target, should you prioritize INT8 matmul or continuous batching with PagedAttention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1547", "title": "Roofline Analysis of LLM Decoding on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What arithmetic intensity and throughput should you expect from upgrading to 600 TFLOPS but the same 2.0 TB/s bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1549", "title": "Diagnosing Low Utilization in 100B DLRM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing low SM utilization and high P99 latency in the DLRM embedding workload, and what limits it?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 1}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1550", "title": "Sizing GPU Clusters for DLRM Embeddings", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many A100 80GB GPUs are minimally required to serve the 10B-entry DLRM embeddings plus the 50GB overhead?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 0}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1551", "title": "Evaluate Memory Locking vs Swap Disabling for P99 Latency", "topic": "compound-ai-systems", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you disable swap or use mlockall for the 20GB embedding service on the 32GB VM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1552", "title": "Mitigating Latency Jitter with Memory Locking", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What latency penalty does 10MB of swapped-out scattered weights add to inference, and what application-level fix prevents it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1553", "title": "DLRM Latency Spikes Post-Traffic Lull", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 800ms P99 burst-latency spikes on the 35GB DLRM VM, and what system-level 
fix should you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1554", "title": "DLRM Tiered Memory Architecture", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you shard the 800B-parameter DLRM purely in GPU HBM or use HBM plus host DDR5 for embeddings, and why?", "chain_ids": ["cloud-chain-auto-012-09"], "chain_positions": {"cloud-chain-auto-012-09": 2}, "chain_tiers": {"cloud-chain-auto-012-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1555", "title": "Calculating Baseline Training Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the absolute minimum baseline memory footprint required per accelerator just to store the weights, gradients, and optimizer states?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1556", "title": "Static Memory Planning Buffer Reuse", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What peak activation memory does static liveness-based planning require, and how much memory is saved compared to naive allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1557", "title": "Diagnosing p99 Latency Spikes from Dynamic Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can graph compilation and memory planning eliminate this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1558", "title": "Evaluating Static Memory Planning vs Dynamic Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you choose between dynamic allocation, static max-shape planning, and bucketed static planning for serving this variable-length 13B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1559", "title": "Diagnosing OOM in 7B LLM Full Fine-Tuning", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does full fine-tuning the 7B FP16 model OOM on an 80GB GPU even with batch size 1, and what 
should you do?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1560", "title": "LLM Decoding on A100 Roofline", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What arithmetic intensity does the A100 need, how does batch-1 decoding compare, and what serving change fixes the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1561", "title": "Small File Metadata Overhead in Object Storage", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What throughput limit does 20ms per-file S3 latency impose, and how should you restructure the 10M-image dataset to fix it?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 0}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1562", "title": "Diagnosing Sublinear Inference Scaling on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did H100 batch-1 LLM decoding improve only ~1.6x instead of >3x, and how can you escape that bottleneck?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 2}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1563", "title": "Diagnosing Small File Metadata Starvation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the A100s starved despite 20GB/s storage bandwidth, and how should the 10M JPEGs be stored instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1564", "title": "Prometheus Cardinality Explosion", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you redesign the observability stack to support tenant-level billing without crashing the metrics server?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 0}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1565", "title": "Evaluating Storage Formats for Small-File Datasets", "topic": "data-pipeline-engineering", 
"competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should 100M 50KB image chips be stored as individual JPEGs for 10k images/s training, and what architecture should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1566", "title": "Diagnosing Observability OOM Cascades", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are the pods running out of memory, and how do you fix it?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 1}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1567", "title": "Managing High Metric Cardinality in Global Model Serving", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate the metric design and what tradeoffs must you make to stabilize the TSDB without losing visibility?", "chain_ids": ["cloud-chain-auto-004-16"], "chain_positions": {"cloud-chain-auto-004-16": 2}, "chain_tiers": {"cloud-chain-auto-004-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1569", "title": "Evaluating Unified Network Fabrics for Mice and Elephant Flows", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 400Gbps Clos fabric use flow-level DLB or cell-based packet spraying to protect LLM training from web-service mice flows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1570", "title": "Optimizing Arithmetic Intensity with Mini-Batching", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum mini-batch size makes the 4096x4096 FP16 layer compute-bound on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1571", "title": "Diagnosing p99 Latency Spikes in Mixed-Flow Networks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does p99 inference latency spike during 5GB ingestion bursts even though ToR utilization averages only 40%, and how should you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1572", "title": "Estimating Mixed-Precision Memory Footprint", "topic": "mixed-precision-training", 
"competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the actual static memory footprint for 10B parameters under standard AMP with Adam, and can it fit on one GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1573", "title": "Diagnosing Mixed-Precision OOM Failures", "topic": "mixed-precision-training", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FP16 mixed-precision AdamW for a 7B model OOM before the first forward pass under DDP, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-01"], "chain_positions": {"cloud-chain-auto-secondary-015-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1574", "title": "Diagnosing Low GPU Utilization for a 350M Transformer on A100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the root cause of this underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1575", "title": "Evaluating FP16 vs BF16 for LLM Pre-training Stability", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 30B LLM training run switch from FP16 with loss scaling to BF16, and what trade-offs justify the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1576", "title": "Evaluating Mini-Batch Size on A100", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is batch size 16 a good systems choice for the MLP, and what batch size would better match the hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1577", "title": "MoE Active Parameter Bandwidth", "topic": "mixture-of-experts", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming FP16 precision and a batch size of 1, how much GPU memory is needed for weights and how much memory bandwidth is consumed to decode a single token?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 1}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1578", "title": "Diagnosing MoE Routing and OOM Bottlenecks", "topic": "mixture-of-experts", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do specific GPUs OOM and tail latency spike under expert parallelism, and how should you fix the 8x7B MoE serving path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1579", "title": "Evaluating MoE Serving Topologies on H100 Nodes", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which parallelization topology should you use for the 8x22B MoE on 8x GPUs, and why is it better than pure TP or pipeline parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1580", "title": "Architecting a Multi-Tier Storage Hierarchy for H100 Clusters", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 5PB image pipeline use a centralized all-flash file system or local NVMe caching to reach 250GB/s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1581", "title": "Diagnosing NVMe Cache Thrashing in CV Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did adding 4TB NVMe caches per node barely improve utilization, and what data access pattern should replace global random shuffling?", "chain_ids": ["cloud-chain-auto-secondary-007-01"], "chain_positions": {"cloud-chain-auto-secondary-007-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1582", "title": "Multi-Tier Pre-fetch Sizing for 3D ViT", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the node's data ingestion rate, can it stream directly from 5GB/s S3, and how long to prefetch a 2TB epoch to NVMe?", "chain_ids": ["cloud-chain-auto-secondary-007-01"], "chain_positions": {"cloud-chain-auto-secondary-007-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1583", "title": "Production-Weighted Model Quality Benchmarking", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the INT4 model's production-weighted accuracy, and what RAG accuracy is needed to keep degradation under 2% relative to the baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1584", "title": "MLPerf Time-to-Train Cost Estimation", 
"topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming linear scaling, what are the estimated training time and cost for each vendor, and which instance is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1585", "title": "Evaluating FP8 Trade-offs in MLPerf Time-to-Train", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt FP8 for the MLPerf LLM submission given 1.2s steps but 70,000 convergence steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1586", "title": "Diagnosing Activation Outliers in INT8 Quantized LLMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did W8A8 quantization look fine on MMLU but fail on long-context support queries, and what is the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1587", "title": "Model Card Disaggregated Performance SLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is Group C's disaggregated accuracy, and does the clinical triage model meet the 85% subgroup SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1588", "title": "Large Batch MLPerf Convergence Analysis", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did MLPerf Time to Train regress after doubling the batch size despite higher hardware throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1589", "title": "Root-Causing Deployment Failures via Disaggregated Evaluation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause this systemic failure, and what specific statistical artifacts were missing from the Model Card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1590", "title": "Benchmarking Quantization Degradation in Generative LLMs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did offline quantization benchmarks miss the code-completion regression, and how should you benchmark INT8 model quality?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1591", "title": "40B Model Cold-Start Latency Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum theoretical cold-start latency to load the 40B FP16 model into HBM over the 100Gbps network and PCIe Gen5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1592", "title": "Diagnosing LLM Cold-Start Loading Bottlenecks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What structural bottlenecks cause the 4.5-minute cold start for the 140GB LLM, and which loading steps dominate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1593", "title": "Trade-offs in Disaggregated Performance vs Latency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can these two medical triage LLMs meet under 200ms average TTFT and F1 at least 0.92 for each demographic without protected-attribute routing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1594", "title": "Evaluating Storage Backends for LLM Cold-Starts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which storage backend would you choose to load 350GB of FP16 weights within a strict 45-second cold-start SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1595", "title": "Dynamic Ensemble Compute vs Memory", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-request TFLOPS and active VRAM requirements for soft routing (all 4 outputs) versus top-1 hard routing across the 4 experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1596", "title": "Diagnosing Dynamic Ensemble Latency on A10G", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 685ms P99 latency with only 20% GPU utilization, and how would you fix it on 24GB instances?", "chain_ids": ["cloud-chain-auto-secondary-005-08"], "chain_positions": {"cloud-chain-auto-secondary-005-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1597", "title": "Calculating MFU for LLM Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Model FLOPS Utilization for this 20B LLM training run on 256 GPUs with a 12.0-second step time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1598", "title": "Evaluating Dynamic Ensembles for Fraud Detection", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which serving architecture should you choose for the 3 specialized 13B fraud models to meet <100ms P99 at 10,000 QPS, and why?", "chain_ids": ["cloud-chain-auto-secondary-005-08"], "chain_positions": {"cloud-chain-auto-secondary-005-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1600", "title": "Diagnosing TensorRT Dynamic Shape Spikes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the TensorRT engine spike to 200ms and OOM on sequence lengths 32–512, and how should the dynamic shapes be configured?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1601", "title": "TensorRT Format Optimization Latency Calculation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected latency after applying FP16 and TensorRT fusion?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1602", "title": "Black-Box Model Inversion Query Cost", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many queries, how much time, and how much cost would 100 steps require, and what API output mitigation would disrupt the attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1603", "title": "Evaluating TensorRT vs ONNX for Dynamic Shapes", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you optimize this dynamic-shape BERT-Large workload with ONNX Runtime or TensorRT, and how would you minimize P99 latency jitter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1604", "title": "Evaluating MFU for LLM Training on A100 Cluster", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate the current MFU to justify this decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1605", "title": "Cloud API Model Inversion Vulnerability", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What API design flaw enables the 50,000-query face reconstruction attack, and how would you mitigate it without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1606", "title": "Diagnosing PII Leakage in Fine-Tuned LLMs", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did exact-string deduplication fail to prevent PII memorization, and what pipeline changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1607", "title": "Mitigating PII Memorization in Clinical LLMs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which mitigation best prevents memorization under the compute budget: deduplication, DP-SGD, or RLHF refusal guardrails?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1608", "title": "Diagnosing Inter-Node TP Bottlenecks", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you place tensor and pipeline parallelism across the two nodes to eliminate the cross-node all-reduce bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1609", "title": "Calculating Minimum Model Parallelism Degree", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum tensor-parallel degree fits the 30B mixed-precision Adam training state on 40GB GPUs if ZeRO/FSDP and PP are disabled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1610", "title": "Mitigating Black-Box Model Inversion on Health APIs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defense should you choose to stop the 50,000-query inversion attack while keeping latency under 50ms and accuracy loss under 1.5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1611", "title": "Evaluating TP and PP Placement for 175B LLMs", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you place Tensor Parallelism and Pipeline Parallelism across the nodes to maximize MFU for the 175B FP16 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1612", "title": "LLM Quantization Trade-offs on A100", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you deploy the 70B LLM as FP16 across 2 GPUs or INT4 on 1 GPU, and why for decoding throughput and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1613", "title": "GPU Sizing for 70B Model Sharding", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many A10G GPUs are required to tensor-shard the 70B FP16 LLM with 20 GiB total KV cache and activation memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1614", "title": "LLM Inference Throughput with INT8 Quantization", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum token generation rates for the 30B model on one GPU using FP16 versus INT8 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1615", "title": "Diagnosing Asymmetric OOM in Tensor Parallel Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 8-way tensor-parallel 70B model OOM only on GPU 0 during batch-128, sequence-2048 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1616", "title": "LLM Inference Sharding Strategy", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use TP=8, PP=8, or a hybrid to shard the 175B FP16 model on one 8x A100 node for <200ms TTFT, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1617", "title": "Quantization Speedup Failure at Large Batch", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W8A8 quantization improve token throughput by less than 5% at batch size 512 despite double 
the theoretical INT8 peak TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1618", "title": "Verifying LLM Output Watermarks via Z-Score", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What Z-score does 260 green tokens out of 400 produce with γ=0.5, and does it exceed the legal threshold of Z=4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1619", "title": "Evaluating LLM Watermark Sequence Requirements", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the watermark undetectable on 20-token responses, and what minimum sequence length is needed to reach Z >= 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1620", "title": "MIG Profile Packing and Resource Fragmentation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 1g.5gb MIG profiles can Tenant C get after the 4g.20gb and 2g.10gb allocations, and what resource limits further allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1621", "title": "Batched PRNG Collisions in Logit Watermarking", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the watermark z-score collapse from >6 at batch size 1 to ~0.1 under continuous batching with B=64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1622", "title": "Diagnosing Throughput Drop Migrating from MPS to MIG", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did moving from MPS to MIG cause this specific performance degradation, and what is the root cause bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1623", "title": "Global Gateway Rollback Impact Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many requests experience degraded latency during the 45-second routing propagation and 60-second linear drain rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1624", "title": "MIG Compute Partitioning vs. 
Latency SLAs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is partitioning the A100 into 7x 1g.10gb MIG profiles feasible for seven 8GB services with a 50ms P99 SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1625", "title": "Artifact Sync Race Condition in Multi-Region Canary", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused eu-central ModelNotFound errors during the 5% canary rollout, and how would you redesign the deployment protocol?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1626", "title": "Global Large Model Rollout and Rollback Evaluation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you distribute and roll back 60GB model weights across 3 regions to guarantee a sub-60-second global rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1627", "title": "Diagnosing Intersectional Calibration Failures", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically diagnose and correct localized miscalibrations without manually auditing all permutations or overfitting on small slices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1628", "title": "A/B Testing FWER Correction", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many false discoveries are expected across 20 models and 4 metrics at alpha=0.05, and what Bonferroni alpha controls FWER at 0.05?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1629", "title": "Diagnosing A/B Test False Positives", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 18 seemingly significant model promotions degrade revenue despite positive A/B dashboards across 50 variants and 10 metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1630", "title": "Evaluating Multicalibration in Risk Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you 
use iterative multicalibration post-processing or a subgroup MoE for the clinical model under a 50ms P99 budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1633", "title": "Theoretical Minimum MatMul Latency", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum execution time for the FFN up-projection Y=XW on this GPU at 312 TFLOPs FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1634", "title": "Diagnosing Tail Latency in RESTful Image Serving", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 200ms p99 latency and low GPU utilization, and how would you fix the request path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1635", "title": "Dynamic Batching Network Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the max theoretical throughput and end-to-end latency for a request that waits the full 10ms batch timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1636", "title": "MAC Utilization in Memory-Bound Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator, A or B, gives lower batch-1 generation latency per token, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1637", "title": "NPU Thermal Constraints and Edge Compute Limits", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum sustainable frame rate can the CPU and NPU each achieve within the 4W ML thermal envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1638", "title": "Analyzing Edge NPU Graph Compiler Fallbacks", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of the 120ms edge latency spike after replacing ReLU with Swish, despite dropping NPU utilization?", "chain_ids": ["cloud-chain-auto-secondary-015-10"], "chain_positions": {"cloud-chain-auto-secondary-015-10": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1639", "title": "Diagnosing Low MAC Throughput in Custom Kernels", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP16 custom CUDA kernel top out at about 8 TFLOPS instead of using the T4's 65 TFLOPS Tensor Cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1640", "title": "Evaluating Networked Serving vs Embedded Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you extract the image embedding model into a Triton microservice, how many T4 GPUs are needed at 400 QPS, and what max batch delay meets 50ms p99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1641", "title": "Cloud-to-Edge NPU Offloading Architecture", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you move the 1,000-camera video analytics workload to 4 TOPS edge NPUs, and what compute/TCO trade-offs drive the decision?", "chain_ids": ["cloud-chain-auto-secondary-015-10"], "chain_positions": {"cloud-chain-auto-secondary-015-10": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-10": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1642", "title": "Dual-Socket NUMA Dataloader Bandwidth", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much cross-socket bandwidth do GPUs 4-7 need if staging memory is accidentally allocated on Socket 0, and how would you fix the NUMA issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1643", "title": "Diagnosing GPU Starvation via Cross-Socket Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is starving the A100 GPUs despite 40 GB/s storage bandwidth, and how would you fix the NUMA placement?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1644", "title": "Evaluating NUMA-Aware Data Loading Bottlenecks", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes GPUs 4-7 to run at 30% lower utilization, and how should you redesign data-loader CPU and memory affinity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1645", "title": "Diagnosing High Tail Latency in Dual-Socket CPU Inference", "topic": 
"memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you root-cause and resolve this scaling bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-017-20"], "chain_positions": {"cloud-chain-auto-secondary-017-20": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1646", "title": "Optimizing Dual-Socket Memory Bandwidth", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What effective memory bandwidth does the shared Node-0 model get, and how should you reconfigure the workers for NUMA locality?", "chain_ids": ["cloud-chain-auto-secondary-017-20"], "chain_positions": {"cloud-chain-auto-secondary-017-20": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1647", "title": "Evaluating TP and PP Placement Across NVLink", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where should tensor parallelism and pipeline parallelism be placed across H100 nodes to minimize communication bottlenecks, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1648", "title": "NVLink 4.0 Activation Transfer Time", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum time to transfer the 45GB activation from GPU 0 to GPU 1 over NVLink 4.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1649", "title": "Direct S3 Streaming Bandwidth for A100 Clusters", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What network bandwidth and how many concurrent HTTP byte-range requests are required per 8-GPU node to stream the 1MB TFRecords?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1650", "title": "NVLink to PCIe Fallback", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely causes the ~14x slow intra-node ncclAllGather, and how would you verify and fix the topology issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1651", "title": "Object Storage Streaming vs POSIX Systems", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 5PB training dataset be copied to Lustre or streamed directly from object storage, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1652", "title": "Defensive WAF Stress-Test Throughput Architecture", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy can meet 5,000 synthetic WAF test cases/sec on the cluster, white-box surrogate or black-box RL, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1653", "title": "Diagnosing Object Storage Prefix Rate Limits in Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does S3 streaming flatline at exactly 11 GB/s despite 50 GB/s instance networking, and how should the dataset be laid out?", "chain_ids": ["cloud-chain-auto-secondary-015-12"], "chain_positions": {"cloud-chain-auto-secondary-015-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1655", "title": "LLM-Automated Spear Phishing Scale", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 200-token personalized phishing emails can the 50-node T4 botnet generate in 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1656", "title": "Online Learning Backlog Catch-Up Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will it take to process the accumulated backlog and catch up to the live stream once restored?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1657", "title": "Diagnosing OOM in DLRM Online Learning", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing VRAM to grow from 10GB to 24GB over 48 hours despite constant event volume and cleared computation graphs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-1658", "title": "Continuous Adaptation for DLRM under Drift", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should online learning update the full 101GB DLRM or only the 1GB dense MLP while embeddings update daily, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1659", "title": "Debugging Block-wise Softmax for Long-Context Attention", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose and fix the tile-based softmax computation to match exact attention?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 1}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1660", "title": "Estimating ONNX Runtime INT8 Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What inference latency should you expect after enabling DNNL and dynamic INT8 quantization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1661", "title": "Diagnosing ONNX Runtime Graph Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose and resolve this performance bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1662", "title": "Evaluating Online Softmax for Long-Context Kernels", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is block-based online softmax strictly required for 65,536-token attention to avoid the memory wall compared to HBM-materialized or naive 1-pass softmax?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 2}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1663", "title": "Online Softmax Memory Footprint Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much HBM is needed for the 65,536 x 65,536 FP16 attention matrix, and how much SRAM for one 128-query block's online-softmax stats?", "chain_ids": ["cloud-chain-auto-008-13"], "chain_positions": {"cloud-chain-auto-008-13": 0}, "chain_tiers": {"cloud-chain-auto-008-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-1665", "title": "Diagnosing Carbon Footprint Spikes in Hardware Upgrades", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did operational carbon increase after moving the 10-day A100 job to the faster H100 Midwest cluster, and by how much?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1666", "title": "Estimating LLM Training Operational Carbon", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total operational carbon emissions will the 128-node training run produce over 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1667", "title": "Output Stationary SRAM Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact reduction in psum SRAM accesses achieved by an Output Stationary dataflow compared to reading and writing the psum for every MAC operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1668", "title": "LLM Training Region Carbon Footprint Evaluation", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which region must you choose to stay under the 50-ton CO2 budget, and what is the financial trade-off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1669", "title": "Diagnosing Write-Bound Systolic Arrays", "topic": "extreme-quantization", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the architectural root cause of this inverted memory profile, and how should you reconfigure the dataflow to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1670", "title": "Calculating Paged KV Cache Capacity", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many concurrent requests can PagedAttention support with 512-token average sequences versus static 2048-token KV allocation?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 0}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1671", "title": "Dataflow Trade-offs for Cloud Accelerators", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": 
"specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the accelerator use Output Stationary or Weight Stationary dataflow for layers with many partial sums, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1672", "title": "Diagnosing Low Batch Size in Paged KV Cache", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing OOMs in the Paged KV cache with 1024-token blocks on a mixed 150-token and 2000-token workload?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 1}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1673", "title": "Evaluating Paged KV Cache Block Sizes", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Paged KV cache block size, 16 or 256 tokens, should you choose for Llama-2-70B, and why?", "chain_ids": ["cloud-chain-auto-019-02"], "chain_positions": {"cloud-chain-auto-019-02": 2}, "chain_tiers": {"cloud-chain-auto-019-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1674", "title": "Paged KV Cache Capacity Calculation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected increase in concurrent request capacity with 16-token paged KV cache allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1675", "title": "Evaluating Block Sizes in Paged Memory Management", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PagedAttention block size (1, 16, or 256 tokens) would you choose, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1676", "title": "Evaluating PagedAttention for LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What impact will migrating to 16-token PagedAttention blocks have on memory utilization and maximum batch size?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1677", "title": "Calculating PAM4 Baud Rate for 800G Links", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What actual symbol rate per lane is required to support the 800G link over 8 PAM4 lanes 
with 6.25% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1678", "title": "Diagnosing PagedAttention Block Size Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the original OOMs, and how should you choose a PagedAttention block size between 256 and 8 tokens?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 1}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1679", "title": "Diagnosing PAM4 FEC Errors in 400G Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the root cause of this performance degradation in the PAM4 signaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1680", "title": "KV Cache Fragmentation Bottleneck", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the OOMs above 4 requests, and what concurrency should 16-token paged KV allocation support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1681", "title": "Calculating PagedAttention Memory Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory is wasted by contiguous KV allocation, and how much is consumed with 16-token PagedAttention blocks?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 0}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1682", "title": "Identifying Dominated Models on the Pareto Frontier", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the fundamental Pareto-frontier flaw in deploying Model Z over Model X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1683", "title": "Evaluating PAM4 Transceivers for 800G AI Fabrics", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt 100G PAM4 over 8 lanes for 800G ports, and what trade-offs must you budget for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1684", "title": "Evaluating Pareto Optimal Models for Fraud Detection", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which models are Pareto-optimal, which are dominated, and how should the 100ms SLA affect the final choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1685", "title": "PCIe Gen5 Transfer Latency", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum PCIe transfer time per step for 8GB, and why will the effective time be higher?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1686", "title": "Evaluating PCIe Bottlenecks in ZeRO-Offload", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What PCIe lower bound does this ZeRO-3 offload design impose per optimizer step, and is it viable for a 5-second iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1687", "title": "Diagnosing PCIe Host-to-Device Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a 100ms step time physically impossible when fetching 8GB over a single PCIe Gen5 x16 link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1688", "title": "Pipeline Bubble Fraction Calculation", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum global batch size gives 10% or lower GPipe bubble overhead with p=8 and microbatch size 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1689", "title": "Diagnosing PCIe DataLoader Bottlenecks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the CPU bottlenecked, and how do you achieve true asynchronous transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1690", "title": "Evaluating Massive Pinned Memory Allocations", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pin the entire 400GB embedding table in host memory, and what architecture would you use instead?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1691", "title": "Diagnosing Low Utilization in Pipeline Parallelism", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 55% SM utilization, and what exact fraction of compute time is wasted by pipeline bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1692", "title": "Minimizing Pipeline Bubble Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum valid micro-batch count keeps 1F1B idle time below 10%, and what maximum micro-batch size follows?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 0}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1693", "title": "Calculate DMA Transfer Speedup with Pinned Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What transfer time and bandwidth improvement should you expect after enabling pin_memory=True for 800MB batches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1694", "title": "Accelerator Pipeline Throughput Calculation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the steady-state throughput in tiles per 100,000 clock cycles, and what is the utilization of the memory load unit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1695", "title": "GPipe vs 1F1B Schedule Trade-offs in LLM Training", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use flush-based GPipe or 1F1B for 512 microbatches, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1696", "title": "Diagnosing High Pipeline Bubble Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you change the microbatching to reduce PP bubble overhead and exceed 85% utilization without changing global batch size?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 1}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1697", "title": "Optimizing Pipeline Bubble in 175B Model Training", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you increase to 256 microbatches or use interleaved 1F1B with v=4 to cut the PP bubble below 10%, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1698", "title": "Overlapping CPU Preprocessing, H2D Transfer, and GPU Compute to Double Inference Throughput", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can you restructure the CPU preprocessing, H2D transfer, and GPU compute pipeline to double throughput on the T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1699", "title": "Evaluate Pipelining vs Batch Scaling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which design maximizes throughput: CUDA-stream pipelining at batch 64 or sequential batch 128, and what throughput does it achieve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1700", "title": "Evaluating Drift Metrics for High-Volume Credit Scoring", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use K-S tests or PSI to monitor drift for 10M daily categorical and binned features, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1701", "title": "Calculating Feature Drift with Population Stability Index", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the PSI for the income-bracket shift from 80/20 to 50/50, and does it exceed the 0.2 drift threshold?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1702", "title": "Compute Overhead of KernelSHAP Explanations", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many dedicated CPU cores are required to generate KernelSHAP explanations for 10% of 500 requests/sec without queuing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1703", "title": "Diagnosing KernelSHAP Latency Bottlenecks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottleneck is causing the 
KernelSHAP timeouts, and how should you reduce latency below 1 second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1704", "title": "Diagnosing False PSI Alerts for Income Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the PSI alert on Annual Income, and what immediate monitoring change should you make?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1705", "title": "4-Bit PTQ Memory Footprint for LLMs", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 4-bit AWQ weight footprint including metadata, and will the 30B model fit on a 24GB VRAM GPU?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1706", "title": "Evaluating SHAP vs LIME for Real-Time Fraud", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which explainer—KernelSHAP, TreeSHAP, or LIME—should you use to meet the 50ms inference-plus-explainability budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1707", "title": "Debugging INT8 PTQ Degradation in 70B LLMs", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the W8A8 PTQ perplexity collapse, and what targeted quantization fix would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1708", "title": "Evaluating PTQ Strategies for 70B LLM Serving", "topic": "extreme-quantization", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use W8A8 or W4A16 PTQ to fit the 70B LLM on one 80GB A100 for high-throughput serving, and why?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1709", "title": "Cloud RAPL Power Side-Channel Analysis", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you mitigate software power side-channel attacks against the co-hosted LLM service?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": 
{"cloud-chain-auto-015-05": 0}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1710", "title": "Diagnosing Multi-Tenant GPU Power Side-Channel Leaks", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the MIG tenant's architecture leak through NVML telemetry, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": {"cloud-chain-auto-015-05": 1}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1711", "title": "Liquid Cooling Trade-offs at the Power Density Wall", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pay the $15M DLC premium or deploy one node per air-cooled rack, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1712", "title": "Rack Power Density Capacity Calculation", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 42U racks are needed for 16 DGX H100 nodes under a 24kW rack limit, and what space utilization results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1713", "title": "Diagnosing PDU Trips in High-Density GPU Racks", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 15 kW PDU trip despite the average draw being only 9 kW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1714", "title": "Mitigating Power Analysis in Multi-Tenant GPU Inference", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you mitigate 1ms GPU power-telemetry leakage, and how do dummy work, power caps, and SEV-SNP compare?", "chain_ids": ["cloud-chain-auto-015-05"], "chain_positions": {"cloud-chain-auto-015-05": 2}, "chain_tiers": {"cloud-chain-auto-015-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1715", "title": "Calculating Total Facility Energy for Training Cluster", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many MWh will the 1,024-GPU cluster consume at the facility over the 30-day peak-utilization run?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 2}, 
"chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1716", "title": "Data Center PUE vs Grid Carbon Intensity", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which colocation site should you choose, and what are the annual cost and carbon emissions for each facility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1717", "title": "Diagnosing PUE Degradation in Liquid Cooling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the new PUE after the liquid-cooling retrofit, why does it worsen, and is the cluster actually more sustainable?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 3}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1718", "title": "Rack-Level Power Wall Calculation for AI Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the proposed 10-node rack’s peak power draw, and how many 8-GPU nodes fit under the 40 kW rack limit?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 2}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1719", "title": "LLM KV Cache Waste with Static Pre-allocation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With static 2048-token KV allocation on a 24GB RTX 3090, what is the wasted memory per full batch and maximum batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1720", "title": "Diagnosing Sustained Training Throughput Drops on H100 Nodes", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 15-20% throughput drop after 20 minutes of stable training, and how can it be mitigated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1721", "title": "Evaluating H100 Cluster Power and Cooling Topologies", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you choose sparse air cooling or dense liquid cooling for the 1,024-GPU H100 cluster, and how do power limits affect networking, 
TCO, and throughput?", "chain_ids": ["cloud-chain-auto-015-01"], "chain_positions": {"cloud-chain-auto-015-01": 4}, "chain_tiers": {"cloud-chain-auto-015-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1722", "title": "Evaluating KV Cache Pre-allocation Waste", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much KV cache memory does 4096-token pre-allocation waste per request, and why should the 13B serving engine use dynamic allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1723", "title": "LLM Decode Speedup via Weight Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical decode tokens per second do FP16, INT8, and INT4 weight formats achieve on 16 TB/s of A100 memory bandwidth?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 2}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1724", "title": "Diagnosing LLM KV Cache Pre-allocation Waste", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory bottleneck limits the 70B service to 16 concurrent requests, and how large is the KV cache utilization gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1725", "title": "W8A8 vs W4A16 for LLM Decoding", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For batch-16 decoding of the 70B model, would you choose W8A8 or W4A16 quantization, and why?", "chain_ids": ["cloud-chain-auto-014-12"], "chain_positions": {"cloud-chain-auto-014-12": 1}, "chain_tiers": {"cloud-chain-auto-014-12": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1726", "title": "Diagnosing W8A16 Quantization Regression During Prefill", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W8A16 nearly double batch-1 decode speed but make 2048-token prefill 15% slower than FP16?", "chain_ids": ["cloud-chain-auto-014-12"], "chain_positions": {"cloud-chain-auto-014-12": 0}, "chain_tiers": {"cloud-chain-auto-014-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1727", "title": "Diagnosing Preemption Throughput Collapse", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the throughput collapse and GPU starvation when preemptively swapping 32K token KV caches over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1728", "title": "Evaluating Swap vs Recompute in Preemptive Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For preempting 16 batch tasks averaging 4000 tokens, should you swap KV cache over PCIe Gen4 or discard and recompute it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1729", "title": "KV Cache Preemption Swap Latency", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What total PCIe latency penalty does swapping out and back the KV cache for 32 requests at 2,000 tokens incur?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1730", "title": "Calculating Optimal Prefetch Buffer Depth for I/O Jitter", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How deep must the host DRAM prefetch buffer be to cover 1.2s I/O spikes without stalling 150ms GPU steps?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 0}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1731", "title": "Diagnosing GPU Starvation from P99 I/O Jitter", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU stalling, and how deep must the prefetch buffer be to guarantee zero starvation?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 1}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1732", "title": "LLM Inference Phase Bottlenecks", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the arithmetic intensities for a 2048-token prefill and single-token decode, and which hardware limit bounds each phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1733", "title": "Evaluating Prefetch Buffer Depth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", 
"phase": "training", "question": "Should you use a 2-batch prefetch buffer or size it for the 600ms p99.9 tail, and what depth prevents GPU starvation?", "chain_ids": ["cloud-chain-auto-003-02"], "chain_positions": {"cloud-chain-auto-003-02": 2}, "chain_tiers": {"cloud-chain-auto-003-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1734", "title": "Estimating Prefix Caching Savings", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much KV cache memory is saved when 256 requests share the same 2048-token system prompt via prefix caching?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 0}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1735", "title": "Evaluating Split-Pool vs Chunked Prefill", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the service use split-pool serving or chunked prefill to protect the 50ms/token decode latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1736", "title": "Diagnosing Zero-Hit Prefix Caching", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is automatic prefix caching hitting 0% with the SessionID and Date prefix, and how should the prompt be reformatted?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 1}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1737", "title": "Diagnosing Bottlenecks in LLM Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes TTFT/TBT spikes when long prefills and decodes share continuous batching, and what serving architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1738", "title": "Prefix Caching Trade-offs in Agentic LLM Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you add global prefix caching for the 90% shared 4,000-token prompt, and when would cache overhead outweigh its benefits?", "chain_ids": ["cloud-chain-auto-008-06"], "chain_positions": {"cloud-chain-auto-008-06": 2}, "chain_tiers": {"cloud-chain-auto-008-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1739", "title": "Resolving Image 
Pipeline Preprocessing Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why will upgrading to a faster GPU fail to meet the 50ms latency budget, and what preprocessing fix meets it?", "chain_ids": ["cloud-chain-auto-secondary-016-03"], "chain_positions": {"cloud-chain-auto-secondary-016-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1740", "title": "Diagnosing End-to-End Latency in Image Serving", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you quantize the ResNet-50 model to INT8, or optimize the 40ms CPU preprocessing bottleneck first?", "chain_ids": ["cloud-chain-auto-secondary-016-03"], "chain_positions": {"cloud-chain-auto-secondary-016-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1741", "title": "PFC Buffer Headroom Calculation for 400GbE RoCEv2", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum PFC buffer headroom per 400GbE port is needed to absorb in-flight packets after a pause trigger?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1742", "title": "Image Preprocessing Bottleneck in GPU Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Will upgrading the GPU meet the 15ms SLO, and what architecture is more cost-effective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1743", "title": "Diagnosing RoCEv2 PFC Buffer Drops", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the RoCEv2 collapse despite PFC, and how should you tune buffers and congestion control?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1744", "title": "Diagnosing P0 Inference Starvation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the P0 job starved, and how do you resolve this at a system level?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1745", "title": "Evaluating PFC Thresholds in 400G RoCEv2", "topic": "congestion-control", "competency_area": 
"networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you lower the PFC XOFF watermark from 50% to 10%, and what systemic trade-offs does that create for RoCEv2 All-to-All traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1746", "title": "Network Bandwidth Priority Inversion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What delay does J_H experience while J_L writes 500GB at only 10Gbps, and what scheduling anomaly causes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1747", "title": "Priority Inversion in GPU Fleet Orchestration", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you resolve the P0 autoscaler blocked by a P2 lock holder starved by P1 work: priority inheritance, killing P2, or lock timeouts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1748", "title": "Data-Parallel Process Group Misalignment in 3D Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the DP AllReduce running over all 1024 ranks instead of the 8-rank DP group, and what process-group fix is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1749", "title": "PE Array Utilization and Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the accelerator’s peak TFLOPS, actual PE utilization, and primary bottleneck given 65.5 TFLOPS and 512 GB/s HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1750", "title": "Process Group Topology for 3D Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should TP or DP process groups be mapped to intra-node NVLink for TP=8, PP=16, DP=8 training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1751", "title": "Counting Data-Parallel Process Groups in a 3D Layout", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under TP=8, PP=8, DP=8 on 512 GPUs, how many DP process groups must be initialized, and which 8 ranks belong to each — and which links does the DP 
AllReduce traverse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1752", "title": "Systolic Array Padding Underutilization", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the MXU only 12.5% utilized despite unsaturated HBM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1753", "title": "Memory Footprint for Progressive VLM Deployment", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the static weight memory footprints for the 86B FP16 cloud model, 7B INT8 edge model, and 1.5B INT4 mobile model?", "chain_ids": ["cloud-chain-auto-008-17"], "chain_positions": {"cloud-chain-auto-008-17": 0}, "chain_tiers": {"cloud-chain-auto-008-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1754", "title": "Monolithic vs Multi-Core PE Arrays", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PE array design delivers better throughput for independent batch-1 autoregressive decoding streams, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1755", "title": "Debugging Progressive Deployment Failures Across Tiers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the edge-camera OOM crashes and the mobile tier’s 40% accuracy drop, and how would you fix them?", "chain_ids": ["cloud-chain-auto-008-17"], "chain_positions": {"cloud-chain-auto-008-17": 1}, "chain_tiers": {"cloud-chain-auto-008-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1756", "title": "Progressive Deployment Architecture Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a hardware-aware cascade across cameras, gateways, and cloud to meet the 100 ms SLA without losing recall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1757", "title": "Diagnosing OOM Failures in Long-Context Transformer Training", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 32,768-token CUDA OOM despite ZeRO-3 and checkpointing, and what exact attention change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1758", "title": "Evaluating 32K Context Scaling on A100 GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which of TP=8, FlashAttention-2 with checkpointing, or sparse attention should you choose to fit 32K context while preserving exact attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1759", "title": "Evaluating RoCEv2 QoS for Mixed Workloads", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you configure DSCP, ECN, PFC, and virtual lanes so inference stays under 50 ms P99 during training checkpoints?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 2}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1760", "title": "Diagnosing RoCEv2 Head-of-Line Blocking During Checkpoints", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do NCCL AllReduce timeouts correlate with 80 TB checkpointing despite no link oversubscription, and what network QoS fix would you apply?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 1}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1761", "title": "Attention Matrix Memory Calculation at 65K Context", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much FP16 memory does the 65,536-token attention probability matrix require with 32 heads, and what is the increase versus 4,096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1762", "title": "RoCEv2 Traffic Class Allocation with DWRR", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With DWRR weights 1 for storage and 3 for training on a 400 Gbps link, what bandwidth does each class get and does training meet its 250 Gbps SLO?", "chain_ids": ["cloud-chain-auto-002-08"], "chain_positions": {"cloud-chain-auto-002-08": 0}, "chain_tiers": {"cloud-chain-auto-002-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1763", "title": "Diagnosing QAT Divergence in LLMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why 
does INT8 QAT diverge when layer norm outputs spike above 120, and what quantitative quantizer strategy would stabilize training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1764", "title": "Symmetric INT8 Scale and Error Simulation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For symmetric INT8 QAT with max magnitude 12.7, what are S, the INT8 value for activation 3.55, and the simulated quantization error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1765", "title": "Diagnosing DP Congestion in Rail-Optimized Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused DP AllReduce traffic to saturate only IB switches 0 and 1 after restart, and what scheduling constraint fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1766", "title": "Evaluate Rail-Optimized DP Replica Scheduling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For TP=8, PP=4, DP=32 on 8 InfiniBand rails, should DP replicas use unrestricted placement or identical GPU-index pinning, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1767", "title": "Sizing Leaf-Spine Switches for Rail-Optimized Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 64-port 400 Gbps leaf and spine switches are needed across 8 rails, and what port allocation proves 1:1 bandwidth?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 0}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1768", "title": "Rail-Optimized DP Topology Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For TP=1, PP=8, DP=64 on a 4:1 oversubscribed spine, what is the effective per-GPU gradient sync bandwidth for naive versus rail-optimized placement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1769", "title": "Evaluate QAT for LLM Serving", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why would INT8 QAT recover accuracy where PTQ failed for the 7B model, and what serving gains should 
you expect over FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1770", "title": "Diagnosing Stragglers in Multi-Node TP Fabrics", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TP=8 AllReduce latency jump 300% after replacing the rail-optimized InfiniBand fabric with a generic ECMP leaf-spine?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 1}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1771", "title": "Evaluating Rail-Optimized Topologies for Cross-Node TP", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For TP=16 spanning two nodes, how do standard leaf-spine and rail-optimized topologies affect cross-node TP AllReduce performance?", "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 2}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1772", "title": "Diagnosing Block Storage IOPS Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the EBS gp3 data loader cap at 400 MB/s despite 1,000 MB/s provisioned throughput, and how should the dataset be restructured?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": {"cloud-chain-auto-003-04": 1}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1773", "title": "NVMe Random IOPS Bottleneck in Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At 400,000 random 4 KB audio clips per second, what throughput can the NVMe SSD deliver, and does it meet the training requirement?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": {"cloud-chain-auto-003-04": 0}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1774", "title": "Evaluating Storage Upgrades vs Data Serialization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you buy the 500,000-IOPS Extreme NVMe tier or migrate 1B 50KB images to WebDataset/TFRecord, and why quantitatively?", "chain_ids": ["cloud-chain-auto-003-04"], "chain_positions": {"cloud-chain-auto-003-04": 2}, "chain_tiers": {"cloud-chain-auto-003-04": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1775", "title": "CPU Reactive Burst Scaling vs GPU Pre-provisioning", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you absorb the 5,000 QPS launch spike with fast CPU spillover or pre-provision enough GPUs, and what is the cost trade-off?", "chain_ids": ["cloud-chain-auto-001-14"], "chain_positions": {"cloud-chain-auto-001-14": 1}, "chain_tiers": {"cloud-chain-auto-001-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1776", "title": "Hybrid CPU-Spillover for Flash Spikes", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the bottleneck during the 6,000 RPS flash-sale spike, and how would you design CPU spillover to cover the 4-minute GPU warmup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1777", "title": "Absorbing Traffic Spikes with CPU Reactive Scaling", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many 8-vCPU CPU instances are needed for the 2,400 QPS overflow, and how does CPU cost per inference compare with GPU?", "chain_ids": ["cloud-chain-auto-001-14"], "chain_positions": {"cloud-chain-auto-001-14": 0}, "chain_tiers": {"cloud-chain-auto-001-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1778", "title": "RDMA vs TCP/IP Kernel Overhead Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much CPU processing time is saved by using RDMA instead of TCP/IP to transfer the 40 GB FP32 gradient tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1779", "title": "Diagnosing RDMA Fallback and Kernel Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What does 45μs NCCL latency with high ksoftirqd usage indicate, and how would you restore expected InfiniBand AllReduce throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1780", "title": "Calculating GPU Cluster Resource Fragmentation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 4-GPU evaluation jobs can start immediately after 15 distinct 6-GPU training jobs, 
and how many GPUs are stranded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1781", "title": "Evaluating RoCEv2 vs TCP for LLM Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 2,048-GPU cluster use tuned TCP/IP over 400 Gbps Ethernet or RoCEv2 for AllReduce, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1782", "title": "Multi-Dimensional Resource Fragmentation and GPU Stranding", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are Job A pods pending despite 200 idle GPUs, and should you fix it with scheduler bin-packing, re-profiling Job B, or both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1783", "title": "REST API Serialization Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 64 embeddings of 1024 float32 values over JSON, what is the payload size, serialization time, and primary latency bottleneck?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 1}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1784", "title": "Evaluating Public REST to gRPC Migration for Vector APIs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you replace the public REST/JSON embedding API entirely with gRPC, or use a hybrid interface, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1785", "title": "Diagnosing Multi-Tenant GPU Resource Fragmentation", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is an 8-H100, 1.5 TB fine-tuning job pending when the Kubernetes cluster shows 120 free GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1786", "title": "REST Serialization Bottleneck in Inference", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural bottleneck causes 98% CPU utilization and 15% T4 utilization for 500 JSON-encoded 1024-dim embeddings per request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-1787", "title": "Diagnosing Monolithic Accelerator Fab Rejection", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would a foundry reject a 1,250 mm² monolithic 5 nm AI accelerator, and what architectural shift is required?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 2}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1788", "title": "Yield Calculation at the Reticle Limit", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using Y = e^(-AD), what are the expected yields for the 858 mm² monolithic die and each 215 mm² chiplet at 0.1 defects/cm²?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1789", "title": "Monolithic vs. Chiplet Accelerator Evaluation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Considering 25% monolithic yield, 85% chiplet yield, 20 ns interposer latency, and 40 W overhead, which design should go to mass production for training a 500B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1790", "title": "Compute Cost of Forward vs. 
Reverse Mode Autodiff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What wall-clock time would reverse-mode versus forward-mode AD take to compute the full gradient for one batch of the 100M-parameter model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1791", "title": "Custom Autograd vs Reverse-Linked Graph", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which attention implementation should you choose: native PyTorch autograd saving all intermediates or a custom autograd function that recomputes them?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 2}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1792", "title": "Diagnosing OOM in Reverse Mode Differentiation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does VRAM spike and OOM at loss.backward() after the forward pass, and how can training fit without reducing batch size?", "chain_ids": ["cloud-chain-auto-008-08"], "chain_positions": {"cloud-chain-auto-008-08": 1}, "chain_tiers": {"cloud-chain-auto-008-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1793", "title": "Diagnosing Autograd Memory Leaks in Training Loops", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does GPU memory grow by 8 GB per validation step when the loop does epoch_loss += loss, and how should it be fixed?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 1}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1794", "title": "Evaluating Autograd Engines: Forward vs Reverse Mode", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you switch to Forward Mode Differentiation for the 50B model, and what should you use instead to reduce activation memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1795", "title": "Ring AllReduce Network Bandwidth Calculation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum Ring AllReduce time to synchronize 4 GB of gradients across 16 nodes on 100 Gbps links?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1796", "title": "Validation Loop Memory Leak", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory will the retained validation graph consume over 200 batches, and what failure or fix should you expect?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 0}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1797", "title": "Evaluating Ring AllReduce Bottlenecks at Scale", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For synchronizing FP16 gradients of the 175B model on 64 A100s, should you use a Parameter Server or Ring AllReduce, and why?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 4}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1798", "title": "Mitigating Outliers with Robust Loss", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you change the loss to prevent FP16 overflows from 5% extreme outliers without giving up mixed precision?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 0}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1799", "title": "Diagnosing Gradient Explosions from Corrupted Cloud Data", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the NaN gradient crashes, and how would you modify the objective to stabilize training without filtering the 50 TB dataset?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 1}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1800", "title": "RoCEv2 Goodput for Small Tensor Parallel Messages", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical payload goodput for 256-byte RoCEv2 messages on the 400 Gbps link with 80 bytes of overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1801", "title": "Robust Loss vs Co-teaching for Label Noise in a 50B Recommender", "topic": "graph-compilation", 
"competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the 50B recommender use robust loss or Co-teaching to handle 8% label noise, and what are the hardware trade-offs?", "chain_ids": ["cloud-chain-auto-023-03"], "chain_positions": {"cloud-chain-auto-023-03": 2}, "chain_tiers": {"cloud-chain-auto-023-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1802", "title": "Diagnosing Flat Ring AllReduce Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the flat Ring AllReduce taking about 1.58 seconds, and what collective topology should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1803", "title": "Diagnosing RoCEv2 PFC Storms", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose this bottleneck, and what configuration changes stabilize the fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1804", "title": "Evaluating RoCEv2 vs InfiniBand for H100 Clusters", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 400 Gbps fabric would you choose for the 2,048-GPU MoE cluster, considering incast, tail latency, and TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1805", "title": "Evaluating Centralized vs Decentralized RBAC in Distributed ML Data Lakes", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design RBAC across Redis, Iceberg/S3, and archives to satisfy compliance while meeting a 5ms P99 latency SLA for feature serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1806", "title": "RBAC Policy Binding Compression Factor", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What reduction factor in IAM policy bindings does RBAC achieve versus direct per-dataset user permissions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1807", "title": "RBAC API Rate Limiting Bottleneck in Distributed Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", 
"question": "What is causing the dataloader I/O collapse, and how should RBAC be redesigned to avoid per-object IAM checks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1808", "title": "Roofline Model Analysis on A100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the model's theoretical maximum throughput on the GPU, and is it compute-bound or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1809", "title": "Runtime Input Validation Latency Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many CPU cycles per feature dimension are available for validating 1024 float32 features within the 10 microsecond latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1810", "title": "Diagnosing LLM Decode Inefficiency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the Triton compute optimization fail for token decoding, and what is the fundamental Roofline bottleneck?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 2}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1811", "title": "H100 Migration for LLM Decoding", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you migrate the low-batch 70B decoding service from A100 to H100, given 2 FLOPs/byte arithmetic intensity and 2x cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1812", "title": "Mitigating Sponge Attacks via Runtime Input Validation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which validation strategy should protect the H100 LLM fleet from sponge examples while staying under the 20ms P99 overhead budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1813", "title": "Diagnosing Image Decompression Bombs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"training", "question": "Why can requests under the 5MB limit cause 14GB preprocessing spikes, and what validation should be added before decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1814", "title": "Runtime Entropy Monitoring for Adversarial Shifts", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum adversarial request rate is needed to push the 1-minute average entropy above the 1.2-bit alert threshold?", "chain_ids": ["cloud-chain-auto-secondary-017-15"], "chain_positions": {"cloud-chain-auto-secondary-017-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1815", "title": "Diagnosing Runaway Generation from Prompt Injections", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using only runtime output monitoring metrics, how do you diagnose this root cause and differentiate it from normal heavy usage?", "chain_ids": ["cloud-chain-auto-secondary-017-15"], "chain_positions": {"cloud-chain-auto-secondary-017-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1816", "title": "Latency SLAs for Real-Time Saliency Maps", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What latency would 50-step Integrated Gradients add, and why should you use a vanilla Saliency Map to meet the 30ms SLA?", "chain_ids": ["cloud-chain-auto-011-04"], "chain_positions": {"cloud-chain-auto-011-04": 0}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1817", "title": "Explainability Latency Bottleneck on T4 GPUs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling Saliency Maps spike p99 latency to 280ms and drop GPU utilization despite the expected 75ms compute cost?", "chain_ids": ["cloud-chain-auto-011-04"], "chain_positions": {"cloud-chain-auto-011-04": 1}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1818", "title": "Explainability Latency Trade-offs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the medical image service use Saliency Maps or 50-step Integrated Gradients to satisfy the 150ms p99 explanation SLA?", "chain_ids": ["cloud-chain-auto-011-04"], 
"chain_positions": {"cloud-chain-auto-011-04": 2}, "chain_tiers": {"cloud-chain-auto-011-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1820", "title": "A100 Multi-Node Scaling Efficiency", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the two-node scaling efficiency, and how does it change the cost per 1M samples trained versus one 8-GPU node?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1821", "title": "Diagnosing Elastic Scaling Collapse", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does throughput rise only 15% when scaling from 64 to 128 GPUs with a static global batch size, and what should the scheduler do?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1822", "title": "Evaluating GPU Allocation Limits via Scaling Efficiency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you evaluate allocating the requested 128 GPUs versus capping the job at a lower GPU count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1823", "title": "Diagnosing Ingestion Bottlenecks in Synchronous Schema Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the FastAPI ingestion latency spike and 503s after adding synchronous Python JSON schema validation, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": {"cloud-chain-auto-secondary-015-27": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1825", "title": "Evaluating Ingestion Schema Validation for High-Throughput Streams", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you split schema validation between the edge API gateway and stream processor for 500,000 JSON bid requests/sec?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": {"cloud-chain-auto-secondary-015-27": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1826", "title": "SecAgg Mask Generation Compute Overhead", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": 
"both", "question": "If a client's mobile SoC has a PRG throughput of 2.0 GB/s, how much latency does each client add generating SecAgg pairwise PRG masks for 500 clients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1827", "title": "Confidential VM Cold Start Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With pipelined network download, decryption, and PCIe transfer, how long is the 14GB model cold-start load into GPU memory?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 0}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1828", "title": "Hierarchical Secure Aggregation Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the global FL aggregator use one 5,000-client SecAgg group or 10 groups of 500, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1829", "title": "Capacity Reduction and Abstention", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many records are vulnerable initially, after capacity reduction, and after adding abstention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1830", "title": "SecAgg Dropout Recovery Compute Explosion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the SecAgg dropout recovery latency spike when 5% of 10,000 devices drop offline, and how should it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1831", "title": "Evaluating Confidential Computing for LLM Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you enable Confidential Computing for the 70B medical LLM, and will it keep p99 latency under 200ms/token?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 2}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1832", "title": "Mitigating Memorization via Architectural Capacity Constraints", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "What architectural changes would make the 13B financial summarization LLM natively resist training data extraction attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1833", "title": "Diagnosing High TTFT in Confidential GPU Enclaves", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the TTFT regression with Confidential Computing enabled while decoding throughput remains normal?", "chain_ids": ["cloud-chain-auto-001-17"], "chain_positions": {"cloud-chain-auto-001-17": 1}, "chain_tiers": {"cloud-chain-auto-001-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1834", "title": "LLM Extraction Risk and TTFT Tradeoff", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do these choices impact susceptibility to data extraction while maintaining TTFT under 300ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1835", "title": "SMPC Communication Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total communication time for one SMPC inference given 100 layers, 10 rounds per layer, 500 MB, 10 Gbps, and 5 ms RTT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1836", "title": "Diagnosing SMPC Latency Collapse", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why has SMPC throughput fallen to 0.2 QPS with low CPU and bandwidth usage, and should you upgrade to AES-NI instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1837", "title": "SMPC vs FL for Cross-Institution DNN Training", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the consortium use SPDZ-style SMPC or federated learning with differential privacy for the 100M-parameter model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1838", "title": "SimCLR Small-Batch Negative Collapse on A100s", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the SimCLR plateau at 45% linear-probe accuracy and 15% GPU memory utilization, and how would you fix it?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1839", "title": "Masked Autoencoder Pre-training Compute Optimization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many ExaFLOPs per epoch does MAE save versus full-patch training with 75% masking, 30 TFLOPs baseline, 2 TFLOPs decoder, and 1.2M images?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1840", "title": "LSTM Sequential Latency Bound on A100 GPUs", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical minimum latency for one 512-token LSTM sequence, and will increasing batch size to 256 reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1841", "title": "SimCLR vs MAE for ViT-Huge on A100 40GB GPUs", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose SimCLR or MAE to pre-train the 630M-parameter ViT-Huge on 256 A100 40GB GPUs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1842", "title": "LLM Data Ingestion with Sequential Streaming Formats", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the metadata overhead and throughput differ between fetching 100M individual 100KB JSON files versus streaming 10,000 sequential 1GB shards?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1843", "title": "Evaluating Streaming Formats for VLM Training", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you migrate the 5TB dataset to a sequential streaming format like WebDataset (TAR) or tune Parquet, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1844", "title": "Diagnosing Object Store API Bottlenecks", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 500TB multimodal pipeline limited to 1.5 GB/s despite a 100 Gbps link, and what data layout change 
would fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-13"], "chain_positions": {"cloud-chain-auto-secondary-015-13": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1846", "title": "Evaluating RNN Optimization Strategies on A100s", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can CUDA graphs and custom fused kernels make the 2048-step LSTM reach Transformer-level GPU utilization, or is the bottleneck algorithmic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1847", "title": "Data Pipeline Throughput Calculation for NVMe SSDs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What read throughput do 500M 4KB JSON files achieve on the NVMe SSD, does it starve the 4 GB/s GPU pipeline, and how should you store the data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1848", "title": "JSON Serialization Overhead in Batch APIs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What serialization overhead does JSON add versus Protobuf for 500 candidate 512-dim float32 embeddings, and how would latency change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1849", "title": "Evaluating Sequential Storage Patterns for Distributed Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are 64 GPUs starved while reading 500M individual 500KB JPEGs from Lustre, and what data layout should you use?", "chain_ids": ["cloud-chain-auto-011-13"], "chain_positions": {"cloud-chain-auto-011-13": 2}, "chain_tiers": {"cloud-chain-auto-011-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1850", "title": "Diagnosing NVMe I/O Starvation in Vision Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the storage system failing to reach its advertised throughput, and how do you confirm the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1851", "title": "Evaluating RPC Frameworks for High-Throughput Embeddings", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 
embeddings API migrate from JSON REST to gRPC/Protobuf bytes or rely on response caching to maximize GPU saturation, and why?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 3}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1852", "title": "Bottleneck Analysis in Python REST/JSON Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 latency over 35ms when ONNX inference is 5ms, and what mitigation would you propose?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 2}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1853", "title": "Evaluating 112G vs 224G SerDes Trade-offs", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 51.2T switch fabric move from 112G to 224G PAM4 SerDes given 2.5m intra-rack reach requirements, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1854", "title": "Strict SLO in Ad Recommendation Systems", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Under an additive p99 budget, what max dispatch delay keeps p99 under 100ms for batch size 16 if model latency is 12ms + 1ms per extra request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1855", "title": "SLO Allocation in Multi-Stage Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the feed use static per-service latency budgets or dynamic leftover timeouts to meet the 250ms global P99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1856", "title": "Diagnosing P99 SLO Violations in Dynamic Batching", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the DLRM pipeline violate the 150ms P99 SLO, and what maximum dynamic batching timeout should it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1857", "title": "Debugging 112G PAM4 SerDes FEC Failures", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve this 
high-temperature pre-FEC BER bottleneck at the switch level?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1858", "title": "Autoscaling Ramp-up and Boot Time Lag Calculation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much buffer capacity in RPS and nodes is required to avoid SLA violations during the 3-minute autoscaling boot window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1859", "title": "LLM Autoscaling and KV Cache Bottlenecks", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does TTFT exceed 4 seconds when GPU compute utilization stays below 75%, and what metric should autoscaling use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1860", "title": "Evaluating Custom Serving Engine ROI", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 3-month delay to build a 2x-throughput inference engine financially justified at 1,000 QPS and 1,024 output tokens?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 3}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1861", "title": "Calculating the Training vs. 
Inference Cost Crossover", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "In how many days will inference costs exceed the initial 30-day training cost for the 70B LLM API?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 1}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1862", "title": "Analyzing the Inference OpEx Explosion", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why has serving OpEx eclipsed the $300k training CapEx for the 30B recommender, and what architectural inefficiency should be fixed?", "chain_ids": ["cloud-chain-auto-001-18"], "chain_positions": {"cloud-chain-auto-001-18": 2}, "chain_tiers": {"cloud-chain-auto-001-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1863", "title": "Diagnosing SLO Violations from Batch Accumulation", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is P99 TTFT spiking above 1.6 seconds at 80 RPS with dynamic batch size 128, despite sufficient GPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1864", "title": "Batching Limits Under Strict Latency SLAs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum dynamic batch size meets the 60ms latency SLA, and what serving throughput does it allow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1865", "title": "Batching and Scheduling for 7B LLM TTFT and Inter-Token SLAs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should batching and scheduling change to meet TTFT <200ms and inter-token latency <50ms for the 7B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1866", "title": "LLM Serving Autoscaling Under Burst", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you handle 3x traffic spikes within 60s given 4-minute LLM cold starts without paying for a permanent 3x warm buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1867", "title": "Calculating LLM Microservice Serving Tax", 
"topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the absolute serving tax and final TTFT for the Gateway-Tokenizer-Model Worker request path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1868", "title": "Diagnosing Microservice Serving Tax", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing p99 latency to bloat from 20ms GPU compute to 85ms in the JSON/REST recommender pipeline, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1869", "title": "Evaluating Microservice Serving Tax for ML Pipelines", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you split the 120ms monolithic recommender into gRPC microservices with 2MB tensor payloads under a 200ms p99 SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1870", "title": "Sizing Cloud Shadow Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many instances are needed to shadow 12,000 RPS at no more than 75% utilization?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 0}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1871", "title": "Diagnosing Latency Spikes in Synchronous Shadow Deployments", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 100% shadow deployment add 25ms to user-facing P99 latency even though the shadow predictions are discarded?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 1}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1872", "title": "Architecting Shadow Deployments for Latency-Sensitive APIs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What risks does application-layer shadowing create at 5,000 QPS, and what production-grade alternative should you use?", "chain_ids": ["cloud-chain-auto-001-04"], "chain_positions": {"cloud-chain-auto-001-04": 2}, "chain_tiers": {"cloud-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1873", "title": "Diagnosing Object Storage Stalls in Global Shuffling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does perfect global shuffling over 100M 100KB samples from object storage stall the pipeline, and what shuffling strategy should replace it?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 1}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1874", "title": "Sizing Cloud Shard Shuffle Buffers", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What K and B would you configure to maximize randomness under 64GB RAM while saturating the 10Gbps S3 link?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 0}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1875", "title": "Evaluating Global vs. Shard-Level Shuffling for 10TB LLM Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which shuffling design would you choose for the 10TB S3 dataset on 1024 GPUs, and why?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 2}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1876", "title": "70B LLM FSDP Checkpoint Time Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the monolithic and distributed sharded checkpoint write times, and what bottlenecks each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1877", "title": "Diagnosing Sharded Checkpoint Metadata Stalls", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the >5 minute sharded checkpoint stalls on NFS, and how would you fix them?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 3}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1878", "title": "SDC Rate and Optimal Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", 
"phase": "training", "question": "What is the system-level SDC MTBF on 8,192 GPUs, and what checkpoint interval is optimal with 6-minute checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1879", "title": "Root-Causing Deterministic Loss Divergence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically diagnose the root cause of the deterministic loss divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1880", "title": "Mitigating Silent Data Corruption in LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate the trade-offs and select an SDC mitigation strategy to avoid poisoned checkpoints without drastically reducing Goodput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1881", "title": "Evaluating Sharded Checkpointing for 70B LLMs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you migrate from global to distributed sharded checkpointing, and what trade-offs arise when resuming on a different cluster size?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1882", "title": "Evaluating Silent Degradation in Embedding Fleets", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the trade-offs to catch this silent degradation within 5 minutes and <2% latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1884", "title": "Early-Exit Bypass with Synthetic Benchmark Data", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why was the BERT-base benchmark invalid, and how should CI benchmark throughput without masking quality regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1885", "title": "Evaluating Proxy Metrics for Delayed-Label Drift Detection", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which proxy metric strategy would detect the new synthetic fraud vector within 24 hours, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1886", 
"title": "Fraud Detection Silent Failure Investigation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What root causes could explain the complaint spike despite healthy infrastructure, and how would you detect them systematically?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1887", "title": "Silicon Interposer Edge Density Calculation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total memory bandwidth and minimum compute-die edge length needed to interface with all 6 HBM3 stacks?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 0}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1888", "title": "Evaluating 2.5D Silicon Interposer Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which packaging option should you choose for memory-bound trillion-parameter LLM inference, and how does it affect TCO?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 2}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1889", "title": "Root-Causing 2.5D Packaging HBM Failures", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What packaging-level physical issue is causing the edge HBM stacks to fail memory training?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 1}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1890", "title": "SIMT Latency Hiding via Little's Law", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many active threads per SM are required to saturate 2.0 TB/s HBM bandwidth and hide 400 ns memory latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1891", "title": "Analyzing SIMT Register Pressure and Memory Stalls", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is this A100 kernel latency-bound at only 30% HBM bandwidth, and what would you change?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1892", "title": "SIMT Warp Mapping for Sparse Attention", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which sparse attention mapping strategy better leverages the SIMT execution model to hide the ~300 cycle global memory latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1893", "title": "CPU Overhead Calculation for SR-IOV Network Bypass", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many host CPU cores per node does SR-IOV save by bypassing the vSwitch for 4x400Gbps traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1894", "title": "Evaluating SR-IOV vs Virtio for RoCEv2", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the RoCEv2 fabric use virtio-net or SR-IOV, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1895", "title": "JIT Compiler SSA Graph Memory Footprint Calculation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the naive memory footprint of the h state variables in the unrolled SSA graph before liveness or buffer reuse?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 1}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1896", "title": "Root-Causing 400GbE Bottlenecks in Virtualized GPU Nodes", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural misconfiguration is causing this massive throughput collapse and forcing high host CPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1897", "title": "Skip Connection Gradient and Memory Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the first-layer gradient magnitudes with and without skips, and how much FP16 memory does one bypass tensor require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1898", "title": "Evaluating Memory Costs of Skip Connections", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can residual blocks cause OOM during the backward pass on 24GB GPUs, and how would you preserve gradient flow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1899", "title": "Evaluating SSA Form for JIT Compiler Optimization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you bypass SSA generation to save 15 minutes, and how would that affect liveness, fusion, and throughput?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 3}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1900", "title": "Diagnosing Gradient Collapse in Deep CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did removing bypass paths stall the 150-layer 3D CNN, and how would you restore trainability within the memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1901", "title": "Diagnosing Metadata Server Collapse on Lustre", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing low GPU utilization and 100% Lustre MDS CPU, and how should the dataset be stored instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1902", "title": "Evaluating Storage Bottlenecks for Millions of Tiny Images", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option should the team choose to fix 15% GPU utilization, and why is it better than storage hardware upgrades?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1903", "title": "Metadata IOPS Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What IOPS and bandwidth are required, what bottlenecks utilization, and how much would 100MB WebDataset shards reduce IOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1904", "title": "Debugging SSA Compiler OOM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does SSA generation for the unrolled generation loop blow up CPU memory, and what structural fix avoids it?", 
"chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 2}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1905", "title": "Balancing W8A8 Outliers with SmoothQuant", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What SmoothQuant scale s_j balances activation max 121 and weight max 1, and what are the resulting maxima?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 0}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1906", "title": "DLRM Embedding Lookup Bandwidth Calculation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory read volume per batch, and what is the theoretical bandwidth-bound maximum batches per second for the lookup stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1907", "title": "Evaluating SmoothQuant for 175B LLM Serving", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 activation quantization approach would you choose to restore accuracy while maintaining dense Tensor Core throughput, and why?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 2}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1908", "title": "Diagnosing DLRM Embedding Lookup Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why doesn't increasing the batch size fix the SM utilization, and what is the fundamental root cause of this performance ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1909", "title": "Diagnosing Accuracy Collapse in INT8 LLM Deployments", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve this bottleneck without incurring the massive latency overhead of dynamic per-channel activation quantization?", "chain_ids": ["cloud-chain-auto-014-10"], "chain_positions": {"cloud-chain-auto-014-10": 1}, "chain_tiers": {"cloud-chain-auto-014-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1910", "title": "DLRM Embedding Sparse Scatter Bottleneck", "topic": 
"vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What data transfer rate and bandwidth utilization do 10 million 128-byte sparse accesses produce, and what bottleneck causes it?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 0}, "chain_tiers": {"cloud-chain-auto-008-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1911", "title": "Evaluating Embedding Sharding in DLRMs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you parallelize the 400GB embeddings and 2GB dense layers across 8 A100s to minimize step latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1912", "title": "Diagnosing Low Utilization in DLRM Lookups", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the pipeline stalling, and what specific architectural constraint is causing this severe underutilization?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 1}, "chain_tiers": {"cloud-chain-auto-008-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1913", "title": "Evaluating DLRM Sparse Embedding Placement", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 500GB DLRM embeddings live entirely in CPU DRAM or use a hybrid GPU-HBM cache, and why?", "chain_ids": ["cloud-chain-auto-008-10"], "chain_positions": {"cloud-chain-auto-008-10": 2}, "chain_tiers": {"cloud-chain-auto-008-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1914", "title": "Diagnosing I/O Bottlenecks in Stochastic Data Loading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes high CPU I/O wait and low page-cache hits when PyTorch shuffles individual files, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1915", "title": "Evaluating OS Page Cache for Stochastic ML Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you rely on POSIX individual-file reads and the OS page cache for the 100 TB ViT-H dataset, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-1916", "title": "SFU Bottlenecks in Custom Routing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the custom MoE gating function taking 35% of the forward pass despite being O(N), and how would you remove the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1917", "title": "Calculating Effective Bandwidth Under Stochastic I/O", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you eliminate the 500 MB/s random-read bottleneck and reach the 1.6 GB/s needed for 16,000 images/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1918", "title": "Evaluating Dedicated SFUs vs Vector ALUs for Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator architecture is better for Transformer serving: flexible ALUs or 2-cycle SFUs that cost 12% die area, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1919", "title": "Profiling SFU vs Memory Bottlenecks in GeLU", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 8192 x 32768 FP16 GeLU on an A100, is the standalone kernel SFU-compute-bound or HBM-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1920", "title": "Covariance Pruning for Backdoors", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many examples should you prune from the suspicious 50,000-example class using a 1.5x margin over a 0.5% poison rate?", "chain_ids": ["cloud-chain-auto-003-20"], "chain_positions": {"cloud-chain-auto-003-20": 0}, "chain_tiers": {"cloud-chain-auto-003-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1921", "title": "Evaluating Spectral Signatures Computation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the team's concern about a prohibitive 1.2M-vector SVD valid, and how should Spectral Signatures actually be computed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1922", "title": "Diagnosing Multi-Tenant Cache Side-Channels", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware-level vulnerability is leaking Tenant A's weights via branch mispredictions and L3 cache timing, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1923", "title": "Identifying Poisoned Data via Spectral Signatures", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use the 4.5x top singular value in the suspected class to identify and filter poisoned examples?", "chain_ids": ["cloud-chain-auto-003-20"], "chain_positions": {"cloud-chain-auto-003-20": 1}, "chain_tiers": {"cloud-chain-auto-003-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1925", "title": "Mitigating Spectre in Multi-Tenant NLP Gateways", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you enforce IBPB and disable SMT for the shared EPYC tokenization tier, or use stronger isolation instead, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1926", "title": "Spot Instance Preemption TCO", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the expected total cost and wall-clock time of the 100-hour 8-node run on Spot versus On-Demand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1927", "title": "Diagnosing Spot Preemption Hangs in Distributed Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this specific failure pattern, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1928", "title": "Mitigating Watermark Spurious Correlations via Group Sampling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What resampling weights are needed to perfectly balance Watermark/No-Watermark x Positive/Negative groups in each 1024-image batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1930", "title": "Evaluating Spot Instance Cost vs Recovery Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do pure 
On-Demand and pure Spot compare for the 14-day 13B LLM run in total cost and time-to-market?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1932", "title": "Diagnosing Missing Deployment Invariants", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What structural invariant is missing from the Evaluation-to-Deployment contract for the 7B FP16 model on 24GB instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1933", "title": "VRAM Constraint Validation for Model Promotion", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the FP16 7B model satisfy the T4 staging memory contract, and if not, what minimal quantization bit-width passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1934", "title": "LLM Serving Stage Invariants", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What static invariants and dynamic profiling checks should the Optimization-to-Serving contract use to guarantee TBT < 50ms on L4 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1935", "title": "Diagnosing Staged Data Pipeline Starvation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you remove the 210ms sequential data-pipeline stall and maximize utilization on the node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1937", "title": "Optimizing Retraining Interval for E-Commerce", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What retraining interval minimizes average daily compute plus staleness cost for the recommendation model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1938", "title": "Diagnosing Optimal Retraining Frequency for Ad CTR Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is retraining every 12 hours optimal for the CTR model, or what retraining interval minimizes compute plus staleness cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1939", "title": "Ad-Model Retraining Cadence Under Drift Costs",
"topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What static retraining interval minimizes total business cost for the ad-click model given $5,000 retrains and rising daily staleness loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1940", "title": "Evaluating Staged Pipeline Bottlenecks", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which throughput improvement option—PCIe Gen5, more num_workers, or GPU DALI augmentation—should you choose, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1941", "title": "Stateful LLM Serving Fault Tolerance", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which fault-tolerance strategy should you use to minimize P99 recovery latency and operational overhead, and how do they quantitatively compare?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 2}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1942", "title": "Cascading KV Cache OOMs in Stateful LLM Serving", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did reshuffling 2,000 stateful LLM sessions cause OOMs and 15s P99 TTFT, and how should failover be redesigned?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 1}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1943", "title": "Stateful KV Cache Recovery Tradeoffs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 400 crashed sessions, is a Redis KV-cache restore over a 100 Gbps network or a full recompute prefill faster, and what are the recovery times?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 0}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1944", "title": "LLM Stateless Serving Database Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the stateless LLM serving design bottleneck on PostgreSQL at 5,000 req/sec despite adding GPU worker pods?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1945", "title": "Calculating Maximum Static Batch Size for Strict SLA", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If your strict P99 latency SLA is 50ms, what is the maximum static batch size you can configure to ensure the first request in any batch meets the SLA limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1946", "title": "Stateless vs Stateful LLM Serving Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 70B chat service use stateless routing or stateful sticky KV-cache serving, and how do failures change the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1947", "title": "Off-Peak Latency Spikes in Static Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this performance degradation under low load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1948", "title": "Diagnosing Static Graph Compilation OOM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does static graph JIT compilation OOM on a 256GB host for dynamic sequence lengths, and how would you bound the compiler memory?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 2}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1949", "title": "Evaluating Static Batching Limits Under Variable Load", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is purely static batching viable for this workload, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1950", "title": "Memory Traffic Savings via Operator Fusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much HBM traffic do eager versus fused static execution use for the three FP16 pointwise ops, and how long should the fused kernel take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1951", "title": "Evaluating Static Graphs for DLRM Inference", 
"topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the DLRM serving stack use PyTorch eager mode or an ahead-of-time static graph, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1952", "title": "Diagnosing Peak Load Bottlenecks in Recommendation Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are peak login requests saturating the GPUs, and how would static inference eliminate the latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1953", "title": "Scaling DLRM Static Inference Caching", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will 8 GPUs take to refresh recommendations for 50M users, and how much raw Redis memory stores top-50 64-bit IDs per user?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1954", "title": "Diagnosing High Idle Power in GPU Clusters", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the H100 cluster still draw 60% of peak power at 20% utilization after inlet temperatures rise to 28°C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1955", "title": "Datacenter Idle GPU Static Power Scaling", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total facility-level power wasted by the 800 idle GPUs at 70°C with a datacenter PUE of 1.2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1956", "title": "Datacenter Cooling Trade-off: Static Power vs CapEx", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the $4M liquid cooling system justified on energy savings alone over 1 year for the 8,000 H100 training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1957", "title": "Quantifying Non-Stationarity Financial Impact", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What daily revenue loss does the Q3 drift cause, and how quickly must detection and retraining finish to keep losses below $50,000?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1958", "title": "Evaluating Static Inference Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you precompute all 20M feed profiles in Redis, keep dynamic T4 inference, or use a hybrid architecture at 50,000 QPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1960", "title": "Diagnosing Loan Approval Parity", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the Statistical Parity Difference for Groups A and B, and why is 92% accuracy not enough for compliance?", "chain_ids": ["cloud-chain-auto-003-07"], "chain_positions": {"cloud-chain-auto-003-07": 1}, "chain_tiers": {"cloud-chain-auto-003-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1961", "title": "Evaluating Stationarity Violations in Cloud ML", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use weekly sliding-window retraining ($1,200/week) or real-time online learning to adapt to this shifted loan model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1963", "title": "KV-Cache Affinity in Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many sessions lose KV-cache affinity when adding 10 canary nodes with modulo hashing versus consistent hashing?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 0}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1964", "title": "Evaluating L7 Sticky Routing for LLM Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you pin chats with L7 consistent-hash sticky routing on session_id or a model-version JWT during the 10% canary, and why?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 2}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1965", "title": "Diagnosing Context Amnesia in Canary Rollouts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"What is causing the context amnesia during the 10% canary, and what networking-level fix would you implement?", "chain_ids": ["cloud-chain-auto-001-16"], "chain_positions": {"cloud-chain-auto-001-16": 1}, "chain_tiers": {"cloud-chain-auto-001-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1966", "title": "Diagnosing Edge-to-Cloud Satellite Storage Saturation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the 5 Mbps, 6-hour satellite window offload 50GB/day, and how should the pipeline avoid filling the 2TB buffer?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 1}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1968", "title": "Sizing Edge Buffers for Satellite ML Pipelines", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the nightly transfer capacity, is it enough for 180GB/day, and how large an SSD buffer is needed for 4 blackout days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1969", "title": "Store-and-Forward Telemetry over Cellular and Satellite Links", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the edge devices use synchronous daytime streaming or store-and-forward over the 2-hour satellite window, and why?", "chain_ids": ["cloud-chain-auto-002-07"], "chain_positions": {"cloud-chain-auto-002-07": 2}, "chain_tiers": {"cloud-chain-auto-002-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1970", "title": "Federated Learning Over-selection for Stragglers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many clients should be selected to get 1,000 fast updates without stragglers, and what round latency reduction results?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 0}, "chain_tiers": {"cloud-chain-auto-004-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1971", "title": "Diagnosing Cross-Silo Straggler Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you mitigate this straggler effect while preserving the statistical contribution of high-latency nodes?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 1}, "chain_tiers": {"cloud-chain-auto-004-09": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1972", "title": "Evaluating Straggler Mitigation Strategies in Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which should be the default straggler strategy, FedAsync or over-selecting 1,500 clients for the fastest 1,000, and why?", "chain_ids": ["cloud-chain-auto-004-09"], "chain_positions": {"cloud-chain-auto-004-09": 2}, "chain_tiers": {"cloud-chain-auto-004-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1973", "title": "Streaming Architecture for Strict Ad-Bidding SLAs", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you choose Flink with local state or Spark Streaming with Redis for 500,000 bid requests/sec under a 30ms P99 SLA, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-15"], "chain_positions": {"cloud-chain-auto-secondary-015-15": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1975", "title": "Diagnosing Single-Threaded Consumer Lag in Kafka", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the ingestion bottleneck when one CPU core is pegged and the other 15 are idle?", "chain_ids": ["cloud-chain-auto-secondary-015-15"], "chain_positions": {"cloud-chain-auto-secondary-015-15": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1976", "title": "Streaming Inference Batch Size Limits", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If you buffer chunks from a *single* audio stream to increase throughput, what is the maximum batch size you can accumulate without violating the SLA for the first chunk in the batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1977", "title": "A100 2:4 Structured Sparsity Memory Overhead", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the compressed memory footprint of the 8192x8192 FP16 matrix under 2:4 sparsity, including metadata, and what are the exact memory savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1978", "title": "Evaluating 2:4 Structured Sparsity on A100", "topic": 
"transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you deploy 70% unstructured pruning or 2:4 structured sparsity for the 7B model on A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1979", "title": "Unstructured vs Structured Sparsity Slowdown on A100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did 75% unstructured pruning slow inference on A100s, and what sparsity mechanism is needed for acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1980", "title": "Streaming LLM TBT Spikes", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does P99 TBT spike at 40 streams on the 24GB instance, and what serving memory strategy fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1981", "title": "Stateful Streaming vs Stateless Batching ASR", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which ASR architecture should you deploy to meet the <300ms TTFW SLA for 5,000 sessions, and how would you size it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1982", "title": "Diagnosing A/B Test Interference in Social Networks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a +5% treatment lift produce only a +1.5% global content-creation increase, and how should the experiment be redesigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1983", "title": "SUTVA Violations in Dispatch A/B Testing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the measured A/B delta and true rollout impact on rider wait time, and what test design fixes the SUTVA violation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1985", "title": "Diagnosing Synchronous Checkpoint Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the primary bottleneck causing 5-minute synchronous checkpoints when interconnect utilization is under 1%?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1986", "title": "Evaluating Synchronous Checkpointing Overheads", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the synchronous checkpointing overhead for the 4.8TB state, and is a two-tier local-NVMe asynchronous strategy justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1987", "title": "Synchronous Checkpoint Overhead Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long does each checkpoint pause training for a 300B mixed-precision Adam state, and what percentage overhead does a 20-minute cadence add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1988", "title": "Roofline Benchmarking of Custom Layers", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the layer's arithmetic intensity, Roofline-limited throughput, and bound type on the A100?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 1}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1989", "title": "Predicting Model Degradation Thresholds", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "When must retraining start to keep accuracy above 85.0%, and what is the annual compute cost of that retraining cadence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1990", "title": "Diagnosing CTR Degradation Post-Peak Event", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of this silent failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1991", "title": "Evaluating LLM Inference Benchmarks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the vendor's 3x latency reduction claim valid for batch-1 70B LLM serving, and how should you benchmark the real speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1992", "title": "Diagnosing Low TFLOPS in Single-Batch 
Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch-1 autoregressive inference achieve only 2-4 TFLOPS instead of the advertised 312 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1993", "title": "Evaluating Mitigation Strategies for Recommendation System Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 100GB DLRM use daily full retraining or continuous 15-minute online learning to fight -0.5% weekly CTR drift, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1994", "title": "FP16 Activation Payload for Five LLM Layers", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How large is the FP16 activation payload for 5 layers with batch size 16, sequence length 1024, and hidden size 8192?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1995", "title": "Diagnosing Explainability Control Plane Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 400ms/token P99 spike, and how would you extract the 2GB of state without starving the GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1996", "title": "Systolic Array Peak Throughput Estimation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical peak BF16 throughput in TFLOPS if each BF16 MAC counts as 2 FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1997", "title": "Systolic Array Sizing and Padding Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is more efficient for the 64x1024 QKV projections without cross-request batching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1998", "title": "TPU MXU Padding Underutilization for Small Dense Layers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes low TPU utilization for the [64,100] x [100,64] dense layer, and how would you fix it?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-1999", "title": "Fan-out Tail Latency Probability", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What percentage of scatter-gather requests will see 200ms or higher latency across all 50 services?", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 0}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2000", "title": "Evaluating Hedged Requests for Fan-Out Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can either hedging at 25ms or 2x GPU over-provisioning alone guarantee the 50ms system P99 SLA, and why?", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 2}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2001", "title": "LLM Control Plane Explainability Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you log full layer activations or lightweight linear-probe scores for real-time explainability at 5,000 QPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2002", "title": "Diagnosing Parallel Fan-out Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Diagnose why the median user latency degraded in the 50-way scatter-gather system, and calculate the chance of hitting the 150ms tail?", "chain_ids": ["cloud-chain-auto-025-15"], "chain_positions": {"cloud-chain-auto-025-15": 1}, "chain_tiers": {"cloud-chain-auto-025-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2003", "title": "Evaluating Telemetry Aggregation for LLM Inference Fleets", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use 1% random sampling or node-level sidecar aggregation for the 50,000 QPS telemetry pipeline, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2004", "title": "LLM Inference Trace Saturation", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is saturating the 100 Gbps observability uplink, 
and what node-level aggregation strategy would you deploy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2005", "title": "Edge Telemetry Aggregation for High-Throughput Inference", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total egress bandwidth results from keeping 100% of errors, sampling 1% of successes, and sending 100 KB/s of metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2006", "title": "Tensor Core Bottlenecks in GEMM", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the arithmetic intensity, bottleneck type, and minimum execution time for the M=N=K=8192 FP16 GEMM on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2007", "title": "Evaluating Tensor Memory Layouts for Tensor Cores", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you switch the ResNet pipeline to NHWC despite three NCHW-only custom kernels, and when is the conversion cost justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2008", "title": "Diagnosing Tensor Core Underutilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze and resolve the bottleneck preventing high Tensor Core utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2009", "title": "Diagnosing Non-Contiguous Tensor Memory Bottlenecks", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this degradation at the tensor abstraction level, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2010", "title": "Tensor Contiguity Memory Overhead", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the exact High Bandwidth Memory (HBM) traffic (in MiB) generated strictly by this .contiguous() operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2011", "title": "Optimizing Tensor Core Utilization via Dimension Padding", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level":
"L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you pad the FFN intermediate dimension from 3000 to 3072, and how do the Tensor Core trade-offs justify it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2012", "title": "Evaluating Payload-to-Tensor Transformation Architectures", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should clients send uncompressed FP16 NCHW tensors, or should Triton accept JPEGs and use DALI for GPU preprocessing, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2013", "title": "JSON to Tensor Parsing Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the JSON parsing overhead for the batch of 32 images, and what transport and preprocessing changes would fix the latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2014", "title": "CPU Bottleneck in Tensor Format Transformation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 45ms latency despite 1.5ms GPU compute, and how would you redesign the input path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2015", "title": "Diagnosing MFU Collapse in Cross-Node Tensor Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did increasing Tensor Parallelism from TP=8 to TP=16 collapse MFU, and how should parallelism cross node boundaries instead?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 1}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2016", "title": "Transposed Tensor Bandwidth Collapse", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does transposing the [32,16,4096,128] tensor before the custom kernel destroy memory bandwidth, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-017-39"], "chain_positions": {"cloud-chain-auto-secondary-017-39": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2017", "title": "Evaluating TP Topology Across Network Boundaries", "topic": "model-tensor-parallelism", 
"competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you serve the 175B model with TP=16 across nodes or TP=8 within each node plus PP=2 across nodes, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2018", "title": "Calculating NCHW Tensor Memory Offsets", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the linear element offset for index [2,128,32,16] in the contiguous NCHW [8,256,64,64] FP32 tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2019", "title": "Calculating 1D Tensor Parallelism Overhead", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For TP=8, what are the per-GPU MLP weight footprint and the All-Reduce activation payload size for the micro-batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2020", "title": "Evaluating NCHW vs NHWC Layouts", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you leave the ResNet-101 in NCHW with implicit cuDNN transposes, or switch globally to NHWC, and why?", "chain_ids": ["cloud-chain-auto-secondary-017-39"], "chain_positions": {"cloud-chain-auto-secondary-017-39": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2021", "title": "TensorRT Fusion and Quantization Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the expected TensorRT-optimized batch latency after 5x kernel fusion and 4x INT8 data-movement reduction?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 0}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2022", "title": "TensorRT Dynamic Shapes Trade-offs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use one static 512-token FP16 engine or INT8 dynamic shapes with optimization profiles for the 10ms P99 SLA, and why?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 2}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2023", "title": "Debugging TensorRT Precision Fallbacks and Fusion Breaks", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the FP16 TensorRT engine 2x slower than expected, and how would you eliminate the formatting nodes?", "chain_ids": ["cloud-chain-auto-005-12"], "chain_positions": {"cloud-chain-auto-005-12": 1}, "chain_tiers": {"cloud-chain-auto-005-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2024", "title": "Evaluating Test-Time Compute vs Model Scaling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 500-token reasoning outputs under 10s P95, should you upgrade to a 400B model or run Best-of-16 on the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2025", "title": "Diagnosing KV-Cache Pressure from 8,000-Token CoT Scratchpads", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 8,000-token CoT scratchpads cause low SM utilization and OOMs, and how would you control KV-cache pressure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2027", "title": "Batched Test-Time Scaling Latency", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the end-to-end latency for the Best-of-16 request with prefix caching, and how does it compare to Best-of-1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2028", "title": "Diagnosing and Mitigating Equal Opportunity Violations", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you equalize the minority group’s TPR to 92% without retraining, and what happens to the review queue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2029", "title": "Post-Hoc Threshold Adjustment for Equal Opportunity", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would post-hoc threshold adjustment equalize TPRs for Groups X and Y, and what trade-offs does it create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2030", "title": "The Embedding Index Memory Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": 
"implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory does just the raw vector data require, before any graph overhead?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 0}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2031", "title": "The ANN Recall-Latency Tradeoff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does improving recall have such a steep latency cost, and what is the fundamental tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2032", "title": "The RAG Latency Composition", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total end-to-end time to first token, and what is the total request time?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 1}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2033", "title": "The Hybrid Search Score Fusion Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you combine these heterogeneous score distributions for meaningful ranking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2034", "title": "The Embedding Dimension Cost Curve", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What infrastructure cost difference does the 1536-dim embedding model create for a 100M-document index versus 384-dim?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2035", "title": "The RAG Context Window Overflow", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the primary cause of the degraded answer quality despite being within the context window limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2036", "title": "The Vector Index Staleness Window", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the systems root cause, and what architectural pattern solves it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2037", "title": "The Cascading Retry Storm", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the system go down for 20 minutes when the LLM only spiked for 3 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2038", "title": "The Agent Tool-Call Latency Budget", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many sequential tool calls can you afford, and how do you design the system to stay within budget?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 0}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2039", "title": "The Embedding Cache Hit Rate Cliff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did scaling users destroy your cache economics, and how do you fix the hit rate collapse?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 0}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2040", "title": "The Reranker Bottleneck Inversion", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the extra 210ms coming from?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 2}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2041", "title": "The Multi-Model GPU Packing Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you fit all 4 models on one GPU, and what happens if you try?", "chain_ids": ["cloud-chain-auto-016-02"], "chain_positions": {"cloud-chain-auto-016-02": 2}, "chain_tiers": {"cloud-chain-auto-016-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2042", "title": "The Product Quantization Memory Tradeoff", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much memory does the PQ-compressed index require, and what is the recall cost?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 1}, "chain_tiers": {"cloud-chain-auto-016-03": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2043", "title": "The DAG Critical Path Optimization", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which investment has higher ROI given the current performance against the 2.5s SLA?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 3}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2044", "title": "The RAG Cache Invalidation Dilemma", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does this update propagate through the caching system, and what is the correct invalidation strategy?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 2}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2045", "title": "The Vector DB Sharding Strategy", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Are range-based or hash-based shards enough, or should the vector DB use locality-aware sharding?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 2}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2046", "title": "The Compound System Evaluation Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you isolate which component caused the regression, and why is this fundamentally harder than debugging a single model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2047", "title": "The Agent Loop Cost Explosion", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the daily inference cost, and how does it compare to a non-agentic RAG system?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 1}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2048", "title": "The Tool-Use Timeout Cascade", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the 
probability that this 2-tool sequence completes within the timeout, and how should you set per-tool timeouts?", "chain_ids": ["cloud-chain-auto-016-05"], "chain_positions": {"cloud-chain-auto-016-05": 2}, "chain_tiers": {"cloud-chain-auto-016-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2049", "title": "The Semantic Cache Collision Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you fix semantic caching without losing its cost benefits?", "chain_ids": ["cloud-chain-auto-016-04"], "chain_positions": {"cloud-chain-auto-016-04": 1}, "chain_tiers": {"cloud-chain-auto-016-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2050", "title": "HNSW Scaling Limits: Latency and Recall Collapse at 1B Vectors", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does 100x more data cause both latency and quality to degrade, and what architectural changes are needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2051", "title": "The Embedding Model Drift Crisis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you manage the transition to the new embedding model without degrading retrieval quality or causing downtime?", "chain_ids": ["cloud-chain-auto-016-03"], "chain_positions": {"cloud-chain-auto-016-03": 3}, "chain_tiers": {"cloud-chain-auto-016-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2052", "title": "The Compound System Tail Latency Amplification", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the junior engineer's P99 analysis correct, and what should you do to meet the 500ms P99 target?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 4}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2053", "title": "The Multi-Agent Consistency Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the 3-agent financial analysis system to show progressive results while preserving factual consistency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2054", "title": "The Model Cascade Routing Decision", "topic": "model-serving-infrastructure", 
"competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What routing strategy achieves 95% effective accuracy while minimizing cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2055", "title": "The RAG vs Fine-Tuning Breakeven Analysis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach has lower TCO, and at what query volume does the breakeven point shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2056", "title": "ML vs Software Development Lifecycle", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the fundamental difference between traditional software development and ML development that makes the standard software lifecycle insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2057", "title": "The Experiment Tracking Storage Budget", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much storage does your experiment tracking system consume per year, and at what point does this become an infrastructure concern?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 0}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2058", "title": "The Iteration Speed Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much does this improve the overall iteration cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2059", "title": "The Feedback Loop Latency Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is this feedback loop asymmetry the single biggest productivity bottleneck in ML engineering?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 0}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2060", "title": "The Wasted Training Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the monthly dollar cost of failed 
experiments?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2061", "title": "Shift-Left Validation for Serving Constraints", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What pre-training validation would prevent these failed mobile runs, and how much would it save at 10 models per month?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2062", "title": "Replacing Grid Search with Bayesian Optimization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is wrong with the grid search approach, and how do you reduce the budget while maintaining similar coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2063", "title": "The Offline-Online Metric Gap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the three most likely causes of this offline-online metric gap?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 1}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2064", "title": "The Data Cascade Failure", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why are data cascades particularly dangerous in ML workflows, and what structural practice would have caught this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2065", "title": "The Experiment Metadata Tax", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What went wrong, and what minimum metadata must be tracked per experiment to prevent this?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 1}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2066", "title": "Early Stopping vs Checkpoint Recovery", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After 12 hours of rising validation loss, should you stop at the hour-60 checkpoint or continue with a lower learning rate, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2068", "title": "The Silent Training Regression", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 40% support-ticket spike after a weekly retrain when aggregate validation accuracy looked normal?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 2}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2069", "title": "The Multi-Objective Experiment Frontier", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Of those 12 feasible experiments, how do you systematically choose the best model?", "chain_ids": ["cloud-chain-auto-011-08"], "chain_positions": {"cloud-chain-auto-011-08": 3}, "chain_tiers": {"cloud-chain-auto-011-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2070", "title": "The Pipeline Debt Diagnosis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the three most likely sources of the 2-to-8-week production slowdown as the team grew to 15 engineers, and how would you fix them?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 2}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2072", "title": "The Experiment Leakage Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Before celebrating, what should you check first, and why does the training curve shape suggest a specific failure mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2073", "title": "The GPU Cluster Utilization Mystery", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can GPUs be 65% idle while engineers wait 12 hours, and what scheduling changes would fix the cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2074", "title": "The Canary Evaluation Framework", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What metrics must an automated evaluation gate track, what thresholds should trigger rollback, and how do you handle the tension between deployment velocity and safety?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2075", "title": "The Training Pipeline Observability Gap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What instrumentation should have been in place, and how would you make the kill-vs-continue decision with only the training loss curve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2076", "title": "The End-to-End Iteration Tax Audit", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Where do you invest engineering effort, and how do you achieve a 3x speedup when every phase seems equally important?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2077", "title": "The ML Platform Architecture Decision", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design the platform architecture, and what is the hardest technical challenge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2078", "title": "The Experiment-to-Production Gap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why do the reported compute, latency, and accuracy gaps exist, and how does the ML development lifecycle fundamentally produce them?", "chain_ids": ["cloud-chain-auto-003-15"], "chain_positions": {"cloud-chain-auto-003-15": 3}, "chain_tiers": {"cloud-chain-auto-003-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2079", "title": "The LLM Evaluation Pipeline Design", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design an evaluation pipeline that provides high-confidence quality signals within a 2-day deployment cadence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2080", "title": "The Retraining Trigger Strategy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate fixed-schedule, performance-triggered, and drift-triggered retraining for a high-stakes medical diagnosis model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2081", "title": "The Training Failure Recovery Architecture", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": 
"published", "phase": "training", "question": "How would you design a resilient training workflow to minimize wasted compute from GPU faults, OOMs, NaNs, and data stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2082", "title": "Attention's Quadratic Memory Wall", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If you double the sequence length from 2K to 4K tokens, by how much does the attention score matrix memory grow?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2x — attention memory scales linearly with sequence length. (Trap: linear scaling of KV cache applied to attention matrix)", "4x — the attention score matrix is S x S, so doubling S quadruples memory.", "8x — attention has cubic scaling due to the three projection matrices Q, K, V. (Trap: confusing N^3 matrix multiplication complexity with N^2 memory complexity)", "It stays the same — attention memory depends only on model dimension, not sequence length. (Trap: confusing model parameters with activation memory)"], "correct_index": 1}}, {"id": "cloud-2083", "title": "CNN vs Transformer Arithmetic Intensity", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which layer typically has higher arithmetic intensity (FLOPs per byte), and what does that imply about their hardware bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Attention has higher arithmetic intensity because it involves more matrix multiplies.", "Convolutions have higher arithmetic intensity due to massive weight reuse across spatial positions.", "They have identical arithmetic intensity since both perform dot products.", "Neither is meaningful to compare because arithmetic intensity only applies to fully-connected layers."], "correct_index": 1}}, {"id": "cloud-2084", "title": "Embedding Table Bandwidth Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is this workload almost always memory-bandwidth-bound rather than compute-bound?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 0}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Because the FP32 multiply-accumulate operations for 128-dim vectors are expensive on GPU cores.", "Because the model's MLP layers after the embedding are compute-heavy.", "Because each lookup is a random memory read with near-zero arithmetic — making it a pure bandwidth-bound workload.", "Because the embedding table doesn't fit in L2 cache, forcing reads from system DRAM."], "correct_index": 2}}, {"id": 
"cloud-2085", "title": "RNN Sequential Dependency vs GPU Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why did transformers largely replace RNNs in production NLP systems, from a systems perspective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["RNNs require more total FLOPs than transformers for the same sequence length.", "RNN hidden states create sequential dependencies that prevent parallel execution across timesteps, severely underutilizing GPU hardware.", "Transformers use less memory than RNNs because they don't store hidden states.", "GPU hardware is physically incapable of running recurrent operations."], "correct_index": 1}}, {"id": "cloud-2086", "title": "MoE AllToAll Communication Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the AllToAll communication volume per MoE layer for 4096 top-2-routed BF16 tokens across 8 GPUs?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 0}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 MB — each GPU only sends tokens it can't process locally.", "~64 MB — tokens dispatched once to the selected experts.", "~128 MB — tokens dispatched to top-2 experts and results returned.", "~512 MB — all tokens are broadcast to all 8 GPUs."], "correct_index": 2}}, {"id": "cloud-2087", "title": "KV Cache Memory Per Token", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the BF16 KV cache memory per token with 8 KV heads, and the total cache for a 4096-token context?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 0}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~40 KiB per token, ~160 MiB for 4K context.", "~320 KiB per token, ~1.25 GiB for 4K context.", "~2.5 MiB per token, ~10 GiB for 4K context.", "~5 MiB per token, ~20 GiB for 4K context."], "correct_index": 1}}, {"id": "cloud-2088", "title": "im2col Memory Expansion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does im2col transform convolution into matrix multiplication, and what is the memory overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["im2col compresses the convolution into a 1D vector, saving memory.", "im2col creates a Toeplitz matrix that has the same size as the original input.", "im2col unrolls overlapping receptive fields into columns, duplicating data up to 9x for 3x3 filters.", "im2col only affects compute time, not memory — it 
reindexes data in-place."], "correct_index": 2}}, {"id": "cloud-2089", "title": "Flash Attention Tiling Strategy", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the core systems insight behind Flash Attention's speedup, and why does SRAM tiling change the bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2090", "title": "MoE Expert Load Imbalance Stall", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely root cause of this synchronization stall, and what is the throughput impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2091", "title": "Prefill vs Decode Compute Profile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is LLM prefill compute-bound while decode is memory-bandwidth-bound when viewed through arithmetic intensity?", "chain_ids": ["cloud-chain-auto-006-02"], "chain_positions": {"cloud-chain-auto-006-02": 0}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2092", "title": "GQA vs MQA KV Cache Tradeoff", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do MHA, GQA, and MQA compare in KV cache size and serving throughput implications for a 30B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2093", "title": "SSM vs Transformer Hardware Tradeoff", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When does each architecture have a systems advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2094", "title": "Transformer Layer FLOP Decomposition", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the attention and FFN FLOPs break down for this layer, and which component dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2095", "title": "DLRM Embedding Table Sharding Strategy", "topic": "transformer-systems-cost", "competency_area": "architecture", 
"track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you shard the 400 GB embeddings across the 8 GPUs, and why not shard every table evenly across all GPUs?", "chain_ids": ["cloud-chain-auto-006-05"], "chain_positions": {"cloud-chain-auto-006-05": 3}, "chain_tiers": {"cloud-chain-auto-006-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2096", "title": "MoE Sparse vs Dense FLOP Equivalence", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do Mixtral 8x7B per-token FLOPs compare to dense 47B and 13B models, and why does MoE get more capacity for fewer FLOPs?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 2}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2097", "title": "Attention GPU Utilization Drop at Long Sequences", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does prefill GPU utilization drop from 55% on 256-token prompts to 25% on 16K-token prompts despite more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2098", "title": "MoE Capacity Factor Tuning", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you reason about the optimal CF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2099", "title": "Transformer Architecture Enables Tensor Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do multi-head attention and FFN layers naturally decompose across GPUs in Tensor Parallelism, and where are the synchronization points?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2100", "title": "Sparse Attention Patterns Meet Hardware Reality", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the theoretical FLOP reduction not translate to proportional wall-clock speedup?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2101", "title": "Embedding Table Hot-Cold Partitioning", "topic": "transformer-systems-cost", "competency_area": 
"architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a tiered caching architecture for the 200 GB embedding table using the 80/20 Zipfian access pattern?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2102", "title": "MoE Expert Parallelism vs Tensor Parallelism", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When should you use expert parallelism versus tensor parallelism for 128-expert MoE inference, and what are the tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2103", "title": "Architecture Choice Drives Serving Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture minimizes $/query between a dense 70B transformer vs a 47B-parameter MoE (8x7B, top-2) vs a 70B-equivalent SSM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2105", "title": "DLRM Training Pipeline End-to-End Design", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the three critical systems bottlenecks for social-scale DLRM training with hourly updates, and how should the architecture address them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2106", "title": "Multi-Architecture Serving Fleet Design", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you allocate 256 GPUs and choose parallelism strategies to serve the 175B dense transformer, 500B MoE, and 2 TB DLRM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2107", "title": "KV Cache Quantization Quality-Throughput Frontier", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What KV-cache quantization system would you deploy to approach 3.5× throughput while keeping quality loss under 1%?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 3}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2108", "title": "MoE Inference Memory Efficiency Problem", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you 
improve HBM memory utilization for the 128-expert 800B MoE without degrading inference latency?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 4}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2109", "title": "The Matrix Multiply FLOP Count", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many floating-point multiply-accumulate operations (MACs) and total FLOPs does this layer require?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2110", "title": "Arithmetic Intensity of a Linear Layer", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of Y=XW for batch size B, and when is it compute-bound versus memory-bound on an A100?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2112", "title": "Gradient Checkpointing: The Memory-Compute Trade", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With gradient checkpointing every 10 layers, what activation memory and training-time compute overhead should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2113", "title": "Forward vs Reverse Mode Autodiff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why would forward-mode AD be catastrophically expensive for a 1B-parameter scalar-loss transformer, and when is it preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2114", "title": "The Dying ReLU Diagnosis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the root cause of the 87% zero activations, and what are two systems-aware mitigations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2115", "title": "The LayerNorm Bandwidth Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does LayerNorm take 8% 
of wall time despite only 0.1% of FLOPs, and why is kernel fusion critical?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2116", "title": "Computational Graph Memory Lifetime Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do transformer activation lifetimes differ from a strict LIFO stack, and why does that matter for GPU memory fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2117", "title": "Vanishing Gradients and Depth Limits", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What causes the layer-1 gradient to be 1e-15 versus 0.1 at layer 50, and what systems consequence does it have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2119", "title": "The Hidden Cost of Dynamic Graphs", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the actual bottleneck causing the GPU launch gaps, and what two concrete fixes would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2121", "title": "Designing Mixed-Precision Backpropagation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should tensors flow through the 70B LLM mixed-precision forward-backward-update cycle, including FP32 loss, loss scaling, and master weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2122", "title": "Exploiting Activation Sparsity in FFN Layers", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will 90% ReLU activation sparsity yield a near-10x FFN speedup by skipping zero rows, and what hardware obstacles limit it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2123", "title": "BatchNorm Breaks in Distributed Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 accuracy drop when local batch size falls from 256 to 4 across 64 GPUs, and what are two fixes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2124", "title": "Designing an Autograd Engine From Scratch", "topic": "roofline-analysis", "competency_area": "compute", "track": 
"cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the autograd engine's data structures and backward algorithm, and why is topological sort essential?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2125", "title": "Designing Around Non-Differentiable Operations", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you train through discrete decisions like hard MoE routing end-to-end, and what systems cost does your approach add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2126", "title": "Optimal Recompute-vs-Save Scheduling", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate activation checkpoint selection with a 33% recompute budget, and what practical algorithm would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2127", "title": "The Feature Store Latency Budget", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can you serve this model within the SLA using sequential feature lookups, and why?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 0}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2128", "title": "The Embedding Table Memory Wall", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory do the embedding tables require, and can they fit on a single A100 (80GB HBM)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2129", "title": "The Training-Serving Skew Trap", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the most likely cause of the conversion rate drop?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 0}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2130", "title": "The Batch vs Real-Time Feature Tradeoff", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is this architecture fundamentally inadequate for this feature, and what is the minimum viable real-time 
path?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 0}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2131", "title": "The Feature Backfill Cost", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What infrastructure cost is needed to backfill user_lifetime_spend_percentile for 100M users over 180 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2132", "title": "The Embedding Lookup Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "A junior engineer suggests upgrading to a GPU with 2x more TFLOPS. Will this help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2133", "title": "The Feature Interaction Explosion", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many pairwise feature crosses do 500 categorical features create, and what is the systems impact?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2134", "title": "The Point-in-Time Join Correctness", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What went wrong with the feature join, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2135", "title": "The Feature Freshness vs Cost Tradeoff", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tier the 300 features to capture 90% of freshness value at about 20% of the $2M/year real-time cost?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 2}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2136", "title": "The Embedding Sharding Strategy", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does row-parallel sharding cause massive communication bottlenecks here, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2137", "title": "The Streaming Feature Consistency Trap", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": 
"diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you ensure exactly-once semantics for streaming features to prevent double-counting during rebalances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2138", "title": "The Feature Store Hot Key Problem", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you fix this single-shard saturation without re-architecting the entire feature store?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 2}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2139", "title": "The Feature Version Mismatch", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design feature versioning so models trained on min-max values cannot silently receive z-score values?", "chain_ids": ["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 2}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2140", "title": "The GPU Preprocessing Offload", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you move the 500-feature preprocessing pipeline to the GPU, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2141", "title": "The Unified Feature Store Architecture", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a feature store that guarantees training-serving consistency for 30 models by construction rather than by testing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2142", "title": "The Silent Feature Pipeline Failure", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you debug the 3-week conversion decline when systems dashboards are green but a feature pipeline issue is suspected?", "chain_ids": ["cloud-chain-auto-003-14"], "chain_positions": {"cloud-chain-auto-003-14": 2}, "chain_tiers": {"cloud-chain-auto-003-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2143", "title": "The Feature Serving Latency Decomposition", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": 
"L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you break down the 95ms feature-serving overhead and optimize P99 below the 100ms SLA?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 3}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 95ms is dominated by network latency; upgrade from 10 Gbps to 25 Gbps NICs and add edge caching to reduce RTT from 95ms to under 75ms", "Replace the feature store with an in-GPU embedding cache to eliminate all feature retrieval latency, reducing the pipeline to model inference time only (~25ms)", "Increase the SLA to 150ms since 120ms P99 is already competitive; feature retrieval latency is inherently sequential and cannot be parallelized across different feature sources", "Decompose the 95ms into feature store lookup (~25ms P99), real-time feature computation (~45ms P99), serialization (~5ms), and network RTT (~20ms); parallelize batch and real-time feature fetches to reduce total to ~95ms, meeting the 100ms SLA"], "correct_index": 3}}, {"id": "cloud-2144", "title": "The Multi-Model Feature Platform", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a centralized feature platform for 50 models that shares computation while preserving team independence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2145", "title": "Tiered Embedding Serving for 5ms P99 Lookups", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a 5ms-P99 embedding lookup service for 10M QPS over 2.56TB of embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a 3-tier architecture: in-process hot cache (top 0.1% rows, ~2.56 GB, <0.1ms), distributed warm cache (top 5% rows, ~128 GB Redis, ~1ms), and SSD-backed cold store (full 2.56 TB, ~4ms budget); validate that the measured P99 path stays under 5ms across about 200 serving nodes", "Serve all 2.56 TB from GPU HBM across 32 A100s (80 GB each) for sub-millisecond latency; GPU memory bandwidth of 2 TB/s can handle the 25.6 GB/s aggregate read load easily", "Shard the 2.56 TB across 256 Redis nodes (10 GB each) with consistent hashing; Redis provides ~0.5ms P99 per lookup, meeting the 5ms SLA without any tiered caching", "Use a single 2.56 TB NVMe-backed server because the corpus fits on one machine, ignoring the 50M lookups/s fan-in, network fanout, and request-level P99 tail latency"], "correct_index": 0}}, {"id": "cloud-2146", "title": "The End-to-End Feature Pipeline Redesign", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "With 6 months and 4 engineers, how would you architect, migrate, and validate a system that drives training-serving skew toward zero?", "chain_ids": 
["cloud-chain-auto-011-14"], "chain_positions": {"cloud-chain-auto-011-14": 3}, "chain_tiers": {"cloud-chain-auto-011-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2148", "title": "The Gradient Accumulation Trick", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Without adding more GPUs, how do you achieve this effective batch size, and what is the computational cost?", "chain_ids": ["cloud-chain-auto-005-15"], "chain_positions": {"cloud-chain-auto-005-15": 0}, "chain_tiers": {"cloud-chain-auto-005-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2150", "title": "The Loss Spike at Step 50K", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the three most likely root causes of the loss spike, ordered from most to least common?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2151", "title": "Adam Warmup for LLM Pretraining", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is warmup critical for large models with Adam, and how do you calculate a reasonable warmup duration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2152", "title": "The FP16 Loss Scaling Dance", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What happened, and what mechanism should have prevented this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2153", "title": "The Activation Checkpointing Tradeoff", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the memory-compute tradeoff, and how do you decide which layers to checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2154", "title": "The Batch Size Scaling Wall", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did scaling GPT training to 512 GPUs and 8M-token batches hurt final quality despite faster steps and linear LR scaling?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 2}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2155", "title": "The Training Memory Budget Decomposition", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What parameter ceiling can the memory budget support, and what activation profiling is still needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2156", "title": "85% Gradient Clipping Rate", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Is this a healthy training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2158", "title": "The Pre-Training Cost Estimate", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is a realistic cost estimate to pre-train a 70B model on 2T tokens at $3/GPU-hour, including failure overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2159", "title": "The Slow Training Step Diagnosis", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you profile a 70B run on 256 GPUs to find why steps take 12s instead of the expected 4s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2160", "title": "The Chinchilla Scaling Decision", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option should you choose: 70B on 500B tokens, 25B on 500B tokens, or repeating the 500B tokens about 3 times, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Train a 25B model on 500B tokens to be Chinchilla-optimal; this minimizes training loss per FLOP and produces the most compute-efficient model within the data constraint", "Train the 70B model on 500B tokens (Chinchilla-undertrained) because the larger model may deliver better downstream capability when inference cost is acceptable", "Train the 70B model and repeat the 500B tokens 3x (1.5T tokens) to match the Chinchilla-optimal token count; this spends compute on repeated data and risks diminishing returns", "Split the budget: train a 70B model on 500B tokens, then distill it into a 7B model for serving; this gives both quality and inference efficiency without wasting compute"], "correct_index": 1}}, {"id": "cloud-2161", "title": "The Gradient Accumulation vs. 
Data Parallelism Choice", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the systems-level tradeoffs between large micro-batches (A) and smaller micro-batches with gradient accumulation (B)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2162", "title": "405B Pretraining Divergence at Step 80K", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you systematically diagnose and recover from this reproducible loss divergence around step 80K?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2163", "title": "Training Recipe for a 40B Model on 1,024 GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete training configuration would you choose for the 40B model on 1,024 GPUs, and how would you justify it quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use BF16 AdamW with FP32 master weights, ZeRO-3/FSDP sharding, TP=4 within nodes, activation checkpointing, a 512K-to-2-4M token batch ramp, WSD learning-rate schedule, and async checkpointing", "Train entirely on one data-parallel group with no tensor parallelism or activation checkpointing because ZeRO-3 makes the 640 GB optimizer state only 0.625 GB/GPU", "Use FP16 without loss scaling and no FP32 master weights to minimize memory, then compensate for instability by lowering the learning rate after divergence", "Skip frequent checkpoints because 1,024 GPUs make training fast enough that restart cost is smaller than checkpoint I/O overhead"], "correct_index": 0}}, {"id": "cloud-2164", "title": "Sparse Continual Pre-Training for a 70B Medical LLM", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you use sparse or adapter-based continual pre-training on 100B medical tokens without benchmark regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2165", "title": "CUDA Warp Size: Why 100 Threads per Block Is Inefficient", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many threads actually execute per scheduling unit on an NVIDIA GPU, and why does launching 100 threads per block waste resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2166", "title": "The Memory Coalescing Penalty", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": 
"understand", "status": "published", "phase": "both", "question": "Why does Kernel A run significantly faster than Kernel B on the H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Kernel A is compute-bound, while Kernel B is memory-bound due to different arithmetic intensities.", "Kernel B causes warp divergence, forcing threads to execute sequentially instead of in parallel.", "Kernel A enables memory coalescing where a warp fetches a single 128-byte cache line, whereas Kernel B's strided access wastes up to 97% of fetched cache line bandwidth.", "Kernel B writes to read-only memory, causing L2 cache invalidations on every memory transaction."], "correct_index": 2}}, {"id": "cloud-2168", "title": "The Warp Divergence Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given 65% divergence leaves 35% useful lanes and branch serialization gives 45% efficiency, why is throughput about 15% peak FLOPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2171", "title": "The Flash Attention Tiling Insight", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Flash Attention compute exact softmax without materializing the full attention matrix, and what memory hierarchy level does it exploit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2172", "title": "The Kernel Launch Overhead Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you eliminate kernel launch overhead, and what are the tradeoffs of each approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2173", "title": "The Occupancy vs IPC Tradeoff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the lower-occupancy 64x64-tile GEMM run 2x faster, and when should you intentionally sacrifice occupancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2174", "title": "The GPU Memory Hierarchy Bandwidth Stack", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is skipping shared memory and using L2 directly sound for this memory-bound H100 kernel, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2175", "title": "GEMV vs GEMM Dataflow for M=1 and M=1024", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", 
"phase": "both", "question": "Which dataflow minimizes total data movement for this specific GEMV shape, and how does the answer change when M increases to 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2176", "title": "The CUDA Streams Overlap Failure", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did the three CUDA streams serialize, and what synchronization design enables proper overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2178", "title": "Triton Fusion for RMSNorm, RoPE, and Attention Scores", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tiling strategy, work partitioning, HBM traffic reduction, and primary risk would you choose for this fused Triton kernel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2179", "title": "The Thread Block Scheduling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does increasing the H100 grid from 264 to 265 blocks raise kernel time from 1.0 ms to 1.5 ms, and how should grids be sized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2180", "title": "The Ring AllReduce Bandwidth Cost", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much total data does each GPU send and receive during one AllReduce operation?", "chain_ids": ["cloud-chain-auto-002-01"], "chain_positions": {"cloud-chain-auto-002-01": 0}, "chain_tiers": {"cloud-chain-auto-002-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2181", "title": "The Collective Primitive Confusion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the semantic differences between AllGather, ReduceScatter, and AllReduce, and why does FSDP use them at different points?", "chain_ids": ["cloud-chain-auto-002-06"], "chain_positions": {"cloud-chain-auto-002-06": 0}, "chain_tiers": {"cloud-chain-auto-002-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2182", "title": "The Alpha-Beta Communication Model", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the alpha-beta model, what is the Ring AllReduce time for 200 MB 
across 32 GPUs, and where is the latency/bandwidth crossover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2183", "title": "The Ring vs Tree Algorithm Selection", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does NCCL use Tree AllReduce below 256 KB and Ring AllReduce for larger buffers on the 64-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2184", "title": "The Gradient Bucket Fusion Tradeoff", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Who is right, and what is the real tradeoff being managed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2185", "title": "The Overlap Efficiency Ceiling", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Where did the remaining 10 ms of execution time come from if communication is fully overlapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 10 ms is overhead from SM contention, memory bandwidth contention, and CUDA synchronization introduced by overlapping communication and computation.", "The AllReduce operation requires exactly 10 ms of pure CPU processing before it can be offloaded to the NCCL backend.", "The overlap completely failed to execute, so the system fell back to sequential execution, and the 10 ms is the time taken to check for the failure.", "Communication-computation overlap requires an additional data copy to system RAM, which takes exactly 10 ms."], "correct_index": 0}}, {"id": "cloud-2186", "title": "The Hierarchical AllReduce Asymmetry", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is flat Ring AllReduce slow for 2 GB gradients across 256 GPUs, and what topology-aware algorithm would you use instead?", "chain_ids": ["cloud-chain-auto-002-02"], "chain_positions": {"cloud-chain-auto-002-02": 3}, "chain_tiers": {"cloud-chain-auto-002-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2187", "title": "The MoE AllToAll Communication Wall", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is AllToAll fundamentally more expensive than AllReduce?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2188", "title": "The Gradient Compression Convergence Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive Top-K gradient sparsification hurt convergence, and what mechanism is needed to make it work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2189", "title": "The NCCL Collective Deadlock", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did this cause a deadlock, and what invariant did the model code violate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2190", "title": "The Bandwidth-Optimal Lower Bound", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the AllReduce bandwidth lower bound, does Ring achieve it, and can any large-message algorithm beat Ring on a fully connected topology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2192", "title": "The In-Network Reduction Promise", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the networking team's claim that SHARP will cut AllReduce time in half valid, and when does SHARP help most or least?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2194", "title": "The 100K-GPU Collective Breakdown", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What collective communication architecture would you design for 100,000 GPUs to handle DP AllReduce, FSDP sharding, and MoE AllToAll?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2195", "title": "The Roofline Diagnostic", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is this kernel compute-bound or memory-bound, and what is its theoretical attainable performance according to the roofline model?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 0}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2196", "title": "The Micro-Benchmark Mirage", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why did a 2.5x attention kernel micro-benchmark speedup yield only an 8% end-to-end Llama-70B throughput gain?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2197", "title": "The Statistically Invalid Benchmark", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you determine whether this improvement is statistically significant, and what is the minimum number of runs needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2198", "title": "The Benchmark Gaming Red Flags", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What benchmark gaming techniques should you suspect, and how does MLPerf's compliance framework attempt to prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The red flags are valid benchmark-specific optimizations, but MLPerf compliance means the result is automatically representative of production training cost and convergence behavior.", "Three gaming red flags: accuracy cliff-hanging, unrealistic 128K batch size, and hyperparameter overfitting with a custom LR schedule. MLPerf compliance mitigates this by defining closed-division benchmark rules for the model, dataset, optimizer family, quality target, and reporting requirements while separating open-division submissions.", "The main red flag is the custom learning-rate schedule; MLPerf compliance primarily solves this by banning schedule tuning and requiring vendors to use one reference hyperparameter file.", "The main issue is that the submission belongs in open division, because closed division allows no batch-size scaling or implementation tuning beyond the reference system."], "correct_index": 1}}, {"id": "cloud-2199", "title": "The Right Profiler for the Job", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "In what order do you use these tools, and what does each one tell you that the others cannot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2200", "title": "The Power Measurement Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What went wrong with the power measurement methodology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The measurement estimated 350 kW, but thermal mass smoothing reduces the actual required cooling to 200 kW.", "The measurement accurately captured 350 kW of GPU power, requiring 350 kW of cooling.", "The measurement missed sustained prefill bursts and system overheads, severely underestimating the true peak load of 735 kW.", "The measurement incorrectly included PUE, meaning the actual required capacity is only 250 kW."], "correct_index": 2}}, {"id": "cloud-2201", "title": "The 
Nsys Timeline Mystery", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the three most likely causes of this gap, and how do you distinguish between them using the nsys timeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2202", "title": "The MFU vs HFU Confusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the distinction between MFU and HFU matter, and which metric should you trust for comparing training configurations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["MFU and HFU are identical metrics; 62% is valid regardless.", "62% is likely HFU; true MFU ≈ 47% (accounting for ~33% recompute). MFU normalizes out implementation choices.", "62% is likely MFU; true HFU ≈ 82% (adding 33% recompute to 62%).", "62% is likely nvidia-smi utilization; true MFU is >80%."], "correct_index": 1}}, {"id": "cloud-2203", "title": "FFN Fusion Exposes AllReduce Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does Conservation of Complexity explain the FFN fusion's 12% step-time reduction and AllReduce becoming the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2204", "title": "Debugging Low MFU on a 64-GPU H100 Training Job", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you debug a 64-GPU job with healthy GEMMs, NCCL, and data pipeline but only 35% MFU instead of the expected 55%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Generate an nsys timeline and look for serialization, micro-bubbles between kernel launches, and straggler GPUs causing idle time before AllReduce barriers.", "Switch from a Ring AllReduce to a Tree AllReduce topology to immediately resolve the 20% MFU gap.", "Focus exclusively on the data loader; if it is not stalling, the only other possibility is that the CPU is overheating.", "Increase the batch size by 4x to force the GPU to become compute-bound, ignoring the network communication overhead."], "correct_index": 0}}, {"id": "cloud-2205", "title": "The MLPerf Division Dilemma", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which MLPerf Inference division should the VP prioritize, Open or Closed, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Open division, because 3x faster INT4 quantization proves the hardware's superior raw compute capability.", "Open division, 
because hyperscalers only care about the absolute lowest latency (3x faster).", "Closed division, because it constrains models and allows direct hardware comparisons for procurement.", "Closed division, because custom model architectures highlight specific tensor core advantages."], "correct_index": 2}}, {"id": "cloud-2206", "title": "The Energy Roofline", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which config has higher tokens per joule, and how would an energy roofline show whether the workload is compute-, bandwidth-, or power-limited?", "chain_ids": ["cloud-chain-auto-005-09"], "chain_positions": {"cloud-chain-auto-005-09": 3}, "chain_tiers": {"cloud-chain-auto-005-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2207", "title": "Designing a Benchmark Suite for LLM Inference", "topic": "compound-ai-systems", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What limitations does the current benchmark have, and how would you design a suite that tests real-world production dimensions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2208", "title": "The Silent Performance Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an observability and benchmarking system to detect, attribute, and prevent a gradual 15% LLM P50 latency regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2209", "title": "The Dominant Resource Fairness Intuition", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does this naive split waste cluster resources, and what is the core idea behind Dominant Resource Fairness (DRF)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Naive scheduling equally divides all resources, leaving 40 GPUs and 380 CPUs stranded because each team hits their other resource limit first. DRF equalizes the allocated fraction of each team's bottleneck resource (CPU for A, GPU for B), allocating 16 jobs to A and 20 to B for ~100% cluster utilization.", "Naive scheduling over-allocates GPUs to Team A. DRF ensures that both teams receive an equal number of GPUs, regardless of their CPU requests, ensuring strict GPU fairness.", "Naive scheduling leads to CPU starvation for Team B. DRF resolves this by statically splitting the cluster into CPU-heavy and GPU-heavy partitions, achieving 50% utilization.", "Naive scheduling works well but requires manual tuning. 
DRF dynamically scales the CPU frequency based on GPU utilization, eliminating stranded resources."], "correct_index": 0}}, {"id": "cloud-2210", "title": "The MIG Partitioning Tradeoff", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the fundamental tradeoff MIG introduces, and why can you not simply get 7x the jobs from each GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2211", "title": "The Backfill Scheduling Gap", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does backfill scheduling work here, and how much of the 256 idle GPU-hours does it recover?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 0}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2212", "title": "The Rail-Optimized Placement Problem", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did the placement cause the AllReduce to be 3x slower, and what scheduling constraint would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2213", "title": "The Heterogeneous Fleet Capacity Plan", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you plan capacity for the new serving tier without buying new hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2214", "title": "The Deadline-Aware Scheduling Inversion", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling principle did SJF violate when handling the 50 evaluation jobs, and how should the scheduler be redesigned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["SJF optimizes for maximum cluster utilization, which inherently delays short jobs. The scheduler should use First-In-First-Out (FIFO) to guarantee fairness.", "SJF optimizes for average completion time but ignores job deadlines and slack. The scheduler should be redesigned to use Earliest Deadline First (EDF) or a slack-aware policy so urgent jobs are prioritized.", "SJF failed because it was unaware of the GPUs' memory capacity, scheduling too many evaluation jobs on the same node. The scheduler should be redesigned to be memory-aware.", "SJF correctly optimized the workload, and the deadline miss is an unavoidable consequence of bursty traffic. 
No scheduler redesign is needed; the cluster just needs more GPUs."], "correct_index": 1}}, {"id": "cloud-2216", "title": "The NUMA-Unaware Inference Regression", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the Kubernetes scheduler missing that explains the 40% latency increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2217", "title": "The Multi-Tenant Noisy Neighbor", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What shared resource is the training job contending for?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2218", "title": "The Opportunistic Training Checkpoint Race", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you redesign the preemption protocol to guarantee the 60-second SLA while minimizing lost training work?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 1}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2219", "title": "The Power Density Ceiling", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does the scheduler need to account for power as a schedulable resource, and what is the maximum number of GPUs you can realistically schedule at peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2220", "title": "The Heterogeneous Workload Placement Matrix", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which workload goes on which GPU type to maximize fleet-wide cost-efficiency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2221", "title": "The GPU Fragmentation Death Spiral", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is causing the GPU fragmentation death spiral, and what multi-level fixes would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2222", "title": "The Multi-Tenant Starvation Cascade", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", 
"bloom_level": "create", "status": "published", "phase": "inference", "question": "What scheduling policy resolves Team A's borrowed GPUs, Team B's queued 256-GPU job, and Team C's Black Friday burst without new hardware?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 3}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2223", "title": "Scheduling a 1,000-GPU Job Across H100 and B200 Fleets", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you schedule the job to maximize speed, and can both GPU types be used synchronously?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2224", "title": "The Storage Tier Latency Gap", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many orders of magnitude separate HBM access latency from Lustre, and why does that gap matter for ML training?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 0}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2225", "title": "The Lustre Stripe Throughput", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What aggregate read throughput can the 50-OST-striped file achieve, and what happens if it is stored on only 1 OST?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2226", "title": "The Checkpoint Bandwidth Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a synchronous checkpoint take, and what fraction of training time is lost if you checkpoint every 30 minutes?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 0}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2227", "title": "The NVMe Warm Cache Miss", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the training throughput drop by 40% during the first epoch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2228", "title": "The Object Store Training Anti-Pattern", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "What is the root cause of the 500 samples/sec bottleneck, and how should they restructure the data?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 1}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2229", "title": "NVMe Checkpoint Write Amplification", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How is a 2 TB checkpoint causing more than 2 TB of physical writes to the SSD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash SSDs cannot overwrite data in place; they must erase entire blocks. This forces the flash translation layer to read, erase, and rewrite valid pages along with new data, multiplying the physical writes (write amplification).", "The operating system caches the checkpoint in RAM and continuously flushes it to the SSD, resulting in duplicate physical writes for the same file due to fsync operations.", "PyTorch and other frameworks write checkpoints in small 4KB chunks, which misaligns with the NVMe physical page size and forces the drive to store duplicate metadata.", "The parallel filesystem driver automatically replicates the 2 TB checkpoint across multiple SSDs within the local node to ensure high availability."], "correct_index": 0}}, {"id": "cloud-2230", "title": "The Metadata Storm", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this 'metadata storm' happen, and how do you fix it without upgrading the MDS hardware?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 1}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2231", "title": "The Checkpoint Tiering Strategy", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a tiered checkpoint strategy for the 100K-GPU run with 18 TB checkpoints that keeps GPU idle time under 10 seconds?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 3}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2232", "title": "The I/O Wall at Scale", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 35% drop in per-GPU throughput when scaling from 256 to 4,096 GPUs, given underutilized network and compute?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 2}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The shared Lustre filesystem is hitting an I/O wall; at 4,096 GPUs the aggregate read demand exceeds 80% of the 500 GB/s capacity, causing non-linear latency spikes that stall the data loader.", "The InfiniBand network is bottlenecked by the AllReduce communication of 4,096 GPUs, causing the training step to block on network transfers.", "The GPUs are thermal throttling due to the increased density of the 4,096 GPU cluster, reducing their compute utilization.", "The data pipeline requires more CPU workers to decode the images, as the CPU compute cannot keep up with the GPU demand."], "correct_index": 0}}, {"id": "cloud-2233", "title": "The Erasure Coding Overhead", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does erasure coding hurt write performance, and when is the tradeoff worth it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2234", "title": "The Data Gravity Problem", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What bottleneck caused the 200 TB S3-backed training job to slip from 40 to 55 hours, and what storage architecture fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2235", "title": "The Checkpoint Cascade Failure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What caused this cascade, and how do you prevent it?", "chain_ids": ["cloud-chain-auto-008-04"], "chain_positions": {"cloud-chain-auto-008-04": 2}, "chain_tiers": {"cloud-chain-auto-008-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2236", "title": "The I/O Jitter Amplification", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 500 ms storage hiccup on one or two workers slow all 16,000 GPUs by 450 ms, and how would you fix it?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 3}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The hiccup only affects the local GPU; the overall 450ms slowdown is an illusion. Fix by ignoring it.", "Synchronous AllReduce forces all GPUs to wait for the slowest worker; fix by using asynchronous parameter servers.", "The Lustre hiccup causes an interrupt storm across the InfiniBand fabric. Fix by disabling RoCE.", "Synchronous data parallelism forces a barrier where all GPUs wait for the straggler's delayed compute. 
Fix by increasing prefetch depth and sharding data across more OSTs."], "correct_index": 3}}, {"id": "cloud-2237", "title": "The 100K-GPU Storage Architecture", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What storage hierarchy meets the 100K-GPU cluster's 20 TB/s reads, 20 TB checkpoints, failure tolerance, and $15M budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2238", "title": "The Storage Disaggregation Dilemma", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Under what conditions is disaggregated storage the right choice, and when does converged storage win?", "chain_ids": ["cloud-chain-auto-008-05"], "chain_positions": {"cloud-chain-auto-008-05": 4}, "chain_tiers": {"cloud-chain-auto-008-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Disaggregated storage is always the right answer because pooling NVMe automatically removes checkpoint bottlenecks and tail latency does not matter.", "Converged storage is always the right answer because local NVMe is always faster and shared pools are never worth the complexity.", "Disaggregated storage is a good choice when the cluster is multi-tenant or capacity-bound and the workload can tolerate some extra storage latency. Converged local NVMe wins when checkpoint writes and random reads are in the critical path, or when low-jitter bandwidth is more important than utilization.", "Storage architecture should be chosen primarily from GPU FLOPS and model parameter count; checkpoint I/O is secondary."], "correct_index": 2}}, {"id": "cloud-2239", "title": "BSP vs Streaming Dataflow for Feature Computation", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is a pure streaming approach problematic for the historical batch features, and what execution model mismatch is at play?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2240", "title": "The Shuffle Data Volume Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much data moves across the network during the shuffle phase, and what determines whether this is a broadcast join or a shuffle join?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 0}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2242", "title": "The Petabyte Deduplication Shuffle Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does this approach fail at petabyte 
scale, and what is the standard distributed deduplication pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2243", "title": "Actor Model vs BSP for Data Processing Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under what conditions would the actor-model approach outperform BSP for data processing, and when would it be strictly worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2244", "title": "The Missing Combiner in Distributed Aggregation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is 18 TB moving across the network to produce a 2 GB result, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2245", "title": "Bootstrapping a Quality Classifier for Trillion-Token Corpora", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bias, and how do you build a more robust quality scoring pipeline at this scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bias is over-sampling short documents. Chunk into 512-token segments.", "The bias is language imbalance. Translate all data to English before filtering.", "The bias favors massive SEO spam. Enforce strict maximum length thresholds.", "The bias is domain-quality conflation. 
Use multi-signal heuristics and stratified sampling."], "correct_index": 3}}, {"id": "cloud-2246", "title": "The Curriculum Data Mixing Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you determine the optimal mixture, and what systems constraint makes this problem harder than it appears?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2247", "title": "The Streaming Deduplication State Explosion", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What caused the Flink checkpoint time to grow to 25 minutes, and what architecture handles unbounded deduplication state growth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2248", "title": "The Distributed Join Strategy Selection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which join strategy should you use for the 5 GB, 200 GB, and 2 TB lookup joins, and what is the total network I/O?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 2}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use Broadcast join for all three tables; this is invalid because 200 GB and 2 TB are too large to broadcast safely. Total network I/O is 2.5 TB.", "Use Sort-Merge join for all three tables as one combined pass. Total network I/O is 50 TB + 5 GB + 200 GB + 2 TB ≈ 52.2 TB.", "Use Broadcast join for the 5 GB table, and Sort-Merge or Shuffle Hash join for the 200 GB and 2 TB tables. Total network I/O is roughly 2.5 TB (broadcast) + 50.2 TB (shuffle 2) + 52 TB (shuffle 3) = 104.7 TB.", "Use Shuffle Hash join for the 5 GB and 200 GB tables, and Sort-Merge join for the 2 TB table. 
Total network I/O is 154 TB."], "correct_index": 2}}, {"id": "cloud-2249", "title": "Designing a Multi-Signal Data Quality Pipeline at Petabyte Scale", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a 100 TB/week quality-scoring pipeline that supports cheap, rapid signal-weight ablations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2250", "title": "The Curriculum Ordering vs Random Shuffling Tradeoff", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Under what conditions does curriculum ordering beat random shuffling, and what are the systems costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2251", "title": "The Connected Components Problem in Fuzzy Deduplication", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you efficiently resolve this at the scale of 800M edges over 5B nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2252", "title": "Designing the End-to-End Pre-Training Data Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the full reproducible pipeline for the 2T-token corpus, justify the technology choices, and test the $50K/month budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2253", "title": "The Deterministic Global Shuffle at Trillion-Token Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you implement a deterministic, coordinator-free global shuffle for 2M shards and 2,048 GPUs, and handle elastic GPU-count changes?", "chain_ids": ["cloud-chain-auto-003-05"], "chain_positions": {"cloud-chain-auto-003-05": 3}, "chain_tiers": {"cloud-chain-auto-003-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2254", "title": "SSA Form in Compiler Optimization", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What property of SSA makes it essential for compiler optimization, and what would go wrong without it?", "chain_ids": ["cloud-chain-auto-005-06"], "chain_positions": {"cloud-chain-auto-005-06": 0}, "chain_tiers": {"cloud-chain-auto-005-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2255", "title": "The Dialect Hierarchy Lowering", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does an ML compiler need multiple IR levels instead of compiling directly from the computation graph to machine code?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 0}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2256", "title": "The Tiling Factor Search Space", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many exhaustive tuning trials are needed across 80 unique kernels, and why is guided search used?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 1}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2257", "title": "The Triton vs CUDA Tradeoff", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If hand-tuned CUDA reaches 85% of H100 peak, what peak-FLOPS fraction should Triton achieve, and why is there a gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2258", "title": "The Lowering Pass Fusion Loss", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why would SwiGLU break the fusion pass, and at which IR level does the failure occur?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 1}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2259", "title": "The Graph Break Recompilation Storm", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is causing the recompilation storm, and how do you diagnose which operations are triggering graph breaks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2260", "title": "The Cost Model vs Profiling Dilemma", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did the analytical cost model break on the new hardware, and how would you fix it for future GPU generations?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 2}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The H100 has a smaller L2 cache than the A100, causing the analytical model to overestimate cache hits. The fix is to hardcode the new cache sizes.", "The H100 changed microarchitectural features (like TMA and larger L2 cache) that the analytical model ignored. The scalable fix is to use a learned cost model calibrated with profiling data.", "The A100 cost model relies on Tensor Cores, which were replaced by CUDA cores in the H100. The fix is to switch entirely to random search for all compilations.", "The auto-tuner was profiling kernels using CPU memory instead of GPU HBM. The fix is to ensure all tensors are moved to device memory before tracing."], "correct_index": 1}}, {"id": "cloud-2261", "title": "The XLA vs TorchInductor Decision", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the architectural differences between XLA and TorchInductor, and when does each have a structural advantage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2262", "title": "HMMA to FFMA fallback in Tensor Core codegen", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is codegen falling back to FFMA instead of HMMA, and where in the compiler should you fix it?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 2}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2263", "title": "The Auto-Tuning Transfer Problem", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does tuning for batch_size=32, seq_len=2048 fail on dynamic shapes, and how would you cover the shape space without exhaustive tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Auto-tuning fails because it overfits to the weight distribution of the model. To cover the shape space, you should apply regularization during the tuning phase.", "B) Auto-tuning fails because the learning rate is statically compiled. To cover the shape space, dynamically schedule learning rates based on batch size.", "C) Auto-tuning fails because exhaustive search is too computationally expensive to run online. To cover the shape space, you must tune all possible combinations offline and store them.", "D) Auto-tuning fails because optimal tile sizes depend on the ratio of compute to memory traffic, which shifts with shape. 
To cover the space, partition it into buckets by arithmetic intensity, tune a representative shape per bucket, and dispatch dynamically."], "correct_index": 3}}, {"id": "cloud-2264", "title": "The Custom Op Compiler Co-Design", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why can't the compiler generate efficient code for GLA, and how would you close the gap with FlashAttention-2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2265", "title": "The MLIR Retargetability Boundary", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where does MLIR retargetability break when moving to MI300X, and what extra optimization work is required?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 3}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2266", "title": "The Compilation Latency Wall in Serving", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate kernel caching, AOT compilation, and background compilation against the 5-second cold-start SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2267", "title": "The New Accelerator Compiler Stack", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the MLIR compiler stack for this message-passing PE accelerator, including dialects, lowering passes, and auto-tuning?", "chain_ids": ["cloud-chain-auto-005-10"], "chain_positions": {"cloud-chain-auto-005-10": 4}, "chain_tiers": {"cloud-chain-auto-005-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2270", "title": "The cuDNN Convolution Dispatch", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does cuDNN select different algorithms for different layers, and what is the primary factor driving this selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2272", "title": "The Framework Dispatch Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What specifically causes the 65% CPU-side idle time during inference, and how can you fix it?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": 
{"cloud-chain-auto-005-19": 0}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2273", "title": "The Library vs Custom Kernel Decision", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you evaluate the build-vs-wait decision for writing a custom Triton kernel versus waiting for vendor library support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2274", "title": "The Caching Allocator Fragmentation Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the allocation fail, and what is the role of PyTorch's caching allocator in this failure?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": {"cloud-chain-auto-005-19": 2}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2275", "title": "The torch.compile Graph Break Cascade", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is a graph break, why did these patterns cause 47 of them, and how do you eliminate them to recover the speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2276", "title": "The Autograd Overhead in Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where is the extra 14GB coming from, and how do you reclaim it?", "chain_ids": ["cloud-chain-auto-005-19"], "chain_positions": {"cloud-chain-auto-005-19": 1}, "chain_tiers": {"cloud-chain-auto-005-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2277", "title": "The Runtime Selection for Multi-Model Serving", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which inference runtime should you choose for each of the five models, and why can't one runtime serve all five optimally?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2278", "title": "The CUDA Graphs vs Dynamic Shapes Tradeoff", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you preserve most CUDA Graph benefits while handling variable output lengths and continuous batching?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2279", "title": "High-Fanout Item Feature Retrieval", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 250ms item feature fetch and bring end-to-end P99 under the 150ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reduce the candidate pool from 1,000 to 200 items to cut down fetch times by 80%, reducing item fetch to 50ms and total latency to 120ms.", "Implement batched retrievals (MGET), add an in-memory L1 cache with an 80% hit rate, and fetch the remaining misses in parallel batches to drop remote fetch wall time to ~2ms.", "Upgrade the Redis-backed feature store to faster NVMe SSDs to reduce the 2.0ms database processing time per item down to 0.1ms.", "Increase the number of concurrent threads from 10 to 100 so each thread only makes 10 sequential calls, reducing fetch latency to 25ms."], "correct_index": 1}}, {"id": "cloud-2280", "title": "The Compression Pipeline Ordering Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does it matter whether we prune first and then quantize, or quantize first and then prune?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2282", "title": "The Activation Sparsity Mirage in MoE", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does offloading inactive MoE experts to CPU DRAM cause 5x P99 latency in continuous batching at 200 QPS?", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 0}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2283", "title": "The Structured Pruning Recovery Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you determine if more data will close the gap or if you pruned too much, and how do you systematically set the ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2285", "title": "The Full-Stack Compression Audit", "topic": "extreme-quantization", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is wrong with the current compression pipeline, and how would you redesign it from first principles?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2287", "title": "The Fairness Impossibility Tradeoff", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are demographic parity and equalized odds impossible to satisfy together except in trivial cases?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2288", "title": "The Fairness Monitoring Compute Budget", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 100M DAU and 8 demographic groups, what hourly volume, 30-day audit storage, and compute cost does fairness monitoring require?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2290", "title": "The Intersectional Subgroup Explosion", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive intersectional subgroup enumeration fail, and what computational and statistical challenges must the audit handle?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Naive enumeration is perfectly fine because 100K samples guarantee at least 800 samples per subgroup, providing sufficient statistical power across all 120 intersections.", "The 120 subgroups can be evaluated efficiently using Bonferroni correction without any loss of statistical power or need for more data.", "It fails because enumerating 120 subgroups creates severe statistical underpowering for minority intersections and guarantees false positives due to multiple comparisons without correction.", "The computational overhead of calculating equalized odds 120 times will cause the evaluation pipeline to exceed the maximum runtime of standard CI/CD runners."], "correct_index": 2}}, {"id": "cloud-2291", "title": "The EU AI Act Compliance Pipeline", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What infrastructure is needed for EU AI Act compliance, and which requirement is most expensive to retrofit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2292", "title": "The Bias Drift Silent Failure", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How did fairness degrade silently 
despite the initial audit passing, and what monitoring system would have caught it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2293", "title": "The End-to-End Fairness Infrastructure", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the Responsible AI infrastructure, specifying key components, data flows, storage requirements, and the hardest systems challenge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2294", "title": "Real-Time Aggregates Latency Tradeoff", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do the on-the-fly and streaming designs compare for the 30-day z-score, and what architecture would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2295", "title": "Fraud Detection Feature Latency Bottleneck", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you reduce P99 latency from 180ms to below the 100ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2296", "title": "High-Fanout Item Feature Retrieval", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 250ms item feature fetch and bring end-to-end P99 under the 150ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reduce the candidate pool from 1,000 to 200 items to cut down fetch times by 80%, reducing item fetch to 50ms and total latency to 120ms.", "Implement batched retrievals (MGET) to group the 1,000 item IDs into 5 parallel batches, and add an in-memory L1 cache to achieve an 80% hit rate, dropping fetch latency to ~2ms.", "Upgrade the Redis-backed feature store to faster NVMe SSDs to reduce the 2.0ms database processing time per item down to 0.1ms.", "Increase the number of concurrent threads from 10 to 100 so each thread only makes 10 sequential calls, reducing fetch latency to 25ms."], "correct_index": 1}}, {"id": "cloud-2297", "title": "DLRM Iteration Loop Optimization", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the experiment pipeline to get iteration time under 10 hours without exceeding the current 720 A100-hours per week budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2298", "title": "RAG Experiment Cost and Latency Optimization", "topic": "mlops-lifecycle", "competency_area": 
"optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the RAG experiment workflow to cut average iteration time below 1 hour and cost below $50 per run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2299", "title": "TPU MXU Padding and Memory Stalls", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is MXU utilization only 14% on the TPU v4 Pod, and what changes would you make to improve throughput?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2300", "title": "Hopper FP8 MoE Latency Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did FP8 make MoE decoding slower, and how would you fix the routing and GEMM kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Hopper does not support FP8 for MoE because expert routing requires FP16 precision.", "FP8 runs at double clock speed, causing thermal throttling.", "Skewed token counts cause unaligned FP8 GEMMs and high amax overhead; fix is padding and fusion.", "Memory bandwidth for FP8 scaling factors exceeds savings."], "correct_index": 2}}, {"id": "cloud-2301", "title": "Diagnosing Gradient Checkpointing Bypasses", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 1GB-per-layer memory growth despite checkpointing, and how much memory is being retained?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2302", "title": "MoE Expert Parallelism Decode Bottleneck", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the MoE model failing to achieve expected performance, and how do you redesign the deployment?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 3}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2303", "title": "GQA Prefill Latency Regression", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the prefill regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2306", "title": "The Silent String-to-Hash Collision", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What could cause prediction variance to collapse to 0.05 despite all schema fields being present and non-null?", "chain_ids": ["cloud-chain-auto-secondary-015-27"], "chain_positions": {"cloud-chain-auto-secondary-015-27": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2307", "title": "The Diluted Regional Distribution Shift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why are SEA users seeing hallucinated translations when the global KL drift metric remains below threshold?", "chain_ids": ["cloud-chain-auto-003-06"], "chain_positions": {"cloud-chain-auto-003-06": 3}, "chain_tiers": {"cloud-chain-auto-003-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2308", "title": "GPU Starvation from Cloud Storage", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing low GPU utilization and pegged CPU during ViT training from S3, and how would you fix the input pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2309", "title": "Cross-Region Training Latency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are training time and egress costs exploding with the 50TB dataset in us-east-1 and H100s in us-west-2, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2310", "title": "Data Skew OOMs in Preprocessing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do only a few Spark executors OOM during the user-item join, and how would you change the join to fix it?", "chain_ids": ["cloud-chain-auto-003-12"], "chain_positions": {"cloud-chain-auto-003-12": 3}, "chain_tiers": {"cloud-chain-auto-003-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2311", "title": "Distributed Storage Metadata Thrashing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does ResNet-50 throughput collapse when scaling from 16 to 128 GPUs on NFS, and how would you redesign 
storage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2312", "title": "Periodic Latency Spikes in Feature Store", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 5-minute 3000ms latency spikes in the Kafka-Flink-Redis pipeline, and how would you eliminate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2313", "title": "Auto-Tuning Budget vs Dynamic Batching", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you meet the 5ms latency SLA and the 180-minute compilation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2314", "title": "Evaluating MLIR Fusion for Bandwidth", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "You propose an MLIR lowering pass to fuse these element-wise operations. What is the expected speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2315", "title": "FSDP AllGather PFC Storms at 4K Scale", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the FSDP AllGather tail latency at 4,096 GPUs, and how would you fix the RoCEv2 network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2316", "title": "Optimizing FP8 Formats for LLM Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should you map E4M3 and E5M2 across activations, gradients, and weight updates to avoid FP8 training divergence?", "chain_ids": ["cloud-chain-auto-014-11"], "chain_positions": {"cloud-chain-auto-014-11": 0}, "chain_tiers": {"cloud-chain-auto-014-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2317", "title": "Attention Softmax FP16 Overflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 70B model produce NaNs beyond 16k context, and what precision change fixes attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2318", "title": "INT8 KV Cache Per-Tensor Quantization Failure", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": 
"specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does per-tensor INT8 KV cache quantization break the 13B LLM, and what quantization scheme would preserve quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2319", "title": "Mixed Precision Optimizer State Underflow", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does casting Adam states and master weights to FP16 cause the 10B model's loss to plateau after 5,000 steps?", "chain_ids": ["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 2}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2320", "title": "FP8 Delayed Scaling Out-of-Bounds in Production", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does delayed scaling with FP8 E4M3 produce NaNs when prompts shift abruptly, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2321", "title": "RoPE Embedding Degradation in BF16", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do retrieval tasks fail beyond position 60,000 with BF16 long-context inference, and how would you fix RoPE computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2322", "title": "Mitigating W8A8 Activation Outliers via Mathematical Migration", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you re-architect the quantization pipeline to preserve accuracy without dropping to 16-bit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2323", "title": "MoE Router Logit Overflow in FP16", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do all tokens suddenly route uniformly in the FP16 MoE, and what precision change should be made to the router?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2324", "title": "BF16 Accumulation Precision Loss in Massive GEMMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does BF16 accumulation in the 16384 x 16384 projection hurt validation accuracy, 
and what accumulation precision should be used?", "chain_ids": ["cloud-chain-auto-014-13"], "chain_positions": {"cloud-chain-auto-014-13": 1}, "chain_tiers": {"cloud-chain-auto-014-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2325", "title": "INT4 KV Cache Group Size Architecture", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does per-channel quantization fail for KV caches, and how does grouped quantization fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2326", "title": "Dynamic Loss Scaling in Mixed Precision", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does a static loss scale of 65536 make the FP16 ViT loss NaN on step 1, and what scaling strategy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2327", "title": "Large Vocabulary Cross-Entropy Overflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP16 language-model head and cross-entropy cause NaN loss with a 256,000-token vocabulary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2328", "title": "Activation-Aware Weight Quantization (AWQ)", "topic": "extreme-quantization", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AWQ recover zero-shot accuracy after RTN INT4 fails, without changing the 4-bit data type or inference kernel?", "chain_ids": ["cloud-chain-auto-secondary-011-22"], "chain_positions": {"cloud-chain-auto-secondary-011-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2329", "title": "FP8 KV Cache Asymmetry: K vs V Degradation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP8 E4M3 work for the V cache but cause a 5% MMLU drop for the K cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2330", "title": "MoE Router Overhead in FP8 Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you eliminate the 15ms/token router casting penalty while preventing FP8 E4M3 router overflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2331", "title": "MoE Interconnect Bottleneck on TPU Pods", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is MFU stuck at 15% for the 1.2T MoE on TPU v5e, and how would you redesign MoE routing and parallelism?", "chain_ids": ["cloud-chain-auto-002-04"], "chain_positions": {"cloud-chain-auto-002-04": 3}, "chain_tiers": {"cloud-chain-auto-002-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2332", "title": "The FP8 Speedup Illusion in LLM Decoding", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP8 only improve batch-1 decoding from 22 to 24 tokens/sec on the H100, and what would increase throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2333", "title": "Systolic Array Tile Padding Collapse", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does throughput collapse when the batch size changes from 256 to 257 on the 256x256 NPU, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2334", "title": "FlashAttention SRAM Bank Conflicts", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the 12.5% SRAM bandwidth utilization in this FlashAttention kernel, and how would you fix it?", "chain_ids": ["cloud-chain-auto-012-08"], "chain_positions": {"cloud-chain-auto-012-08": 4}, "chain_tiers": {"cloud-chain-auto-012-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SRAM is too small (128MB), causing the kernel to thrash to HBM. Fix it by increasing the SRAM size or reducing the sequence length.", "The head dimension of 128 creates severe SRAM bank conflicts because it is a multiple of the bank stride, serializing parallel load requests. Fix it by padding the inner dimension in shared memory to skew consecutive column accesses across different banks.", "The compute SMs are stalled because FlashAttention is compute-bound by the softmax operation. Fix it by using an approximation of softmax to reduce FLOPs.", "The custom accelerator's clock speed is too low for a sequence length of 4096. 
Fix it by pipelining the matrix multiplications to hide latency."], "correct_index": 1}}, {"id": "cloud-2335", "title": "Dataflow Compiler Thrashing on Dynamic Shapes", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the static dataflow accelerator show 850ms p99 and 90% idle time for variable-length speech inputs, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2336", "title": "FPGA DSP Exhaustion and LUT fMAX Collapse", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why did adding the 256x256 MLP layer drop FPGA fMAX from 250MHz to 110MHz, and what RTL/HLS change would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2337", "title": "3D Convolution L2 Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes the 94% L2 miss rate and 8x HBM read amplification in the 256x256x256 3D CNN, and how would you change the layout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2338", "title": "Die-to-Die Interconnect Bottleneck in Chiplet ASICs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the chiplet accelerator run GeLU at 800 TFLOPS but the reduction-heavy GEMM at 120 TFLOPS, and how should GEMM be partitioned?", "chain_ids": ["cloud-chain-auto-006-06"], "chain_positions": {"cloud-chain-auto-006-06": 3}, "chain_tiers": {"cloud-chain-auto-006-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2339", "title": "HBM Thermal Throttling on Zipfian Embeddings", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does throughput fall as HBM stack 0 reaches 95°C while total HBM bandwidth is only 18%, and how would you distribute the load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The total memory bandwidth is saturated by cache evictions, causing the entire HBM to overheat. The load should be distributed by increasing the TPU core clock.", "Highly skewed Zipfian accesses to a few popular embeddings located on Stack 0 cause a localized physical thermal hotspot, triggering hardware throttling despite low overall bandwidth. 
The load should be distributed by replicating these hot embeddings across all 4 HBM stacks.", "The model has a memory leak that specifically targets HBM Stack 0, causing it to overheat as it stores redundant data. The load should be distributed by flushing the memory periodically.", "The 18% HBM bandwidth utilization is a false metric; the TPU actually processes 1200 GB/s internally, causing Stack 0 to overheat. The load should be distributed by using larger batch sizes."], "correct_index": 1}}, {"id": "cloud-2340", "title": "INT4 Weight-Only Quantization Slowdown on A100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT4 weight-only quantization make the 65B model slower despite shrinking weights, and what quantization/deployment fix would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU lacks native INT4 Tensor Cores, so ALUs must unpack INT4 to FP16, creating an 80ms compute bottleneck; fix by using INT8 natively or pre-packing weights.", "The memory read time drops to 17.5ms, but unaligned accesses add 78ms of penalty; fix by using FP16.", "CPU-to-GPU transfer of the 35GB INT4 weights is bottlenecked by the PCIe Gen4 bus.", "INT4 quantization causes massive activation scale outliers that force a fallback to FP32 kernels."], "correct_index": 0}}, {"id": "cloud-2341", "title": "XLA Compiler Host OOM on Long Sequence Attention", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does compiling 128k FlashAttention OOM the 512GB host CPU despite <2GB device memory, and how would you avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2342", "title": "ZeRO-3 Host Offload PCIe Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why are the 8 GPUs only 12% utilized with ZeRO-3 CPU offload despite idle NVLink, and what architecture changes would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2343", "title": "PCIe Switch Oversubscription in KV Paging", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do KV-cache host-to-GPU transfers spike above 600ms with 16 concurrent requests, and how should the allocator/scheduler account for PCIe topology?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 1}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2344", "title": "Hybrid Interconnect MoE Routing", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design",
"bloom_level": "create", "status": "published", "phase": "both", "question": "How would you map the 2GB-per-GPU MoE All-to-All onto the NVLink-pair plus PCIe-switch topology to reduce the 60% throughput loss?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2345", "title": "Bypassing CPU for Massive Checkpoints", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do 80GB-per-GPU checkpoints stall through host CPU copies, and what direct distributed-storage path should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NVMe filesystem has insufficient IOPS to handle 640 GB of concurrent writes, requiring an upgrade to a parallel filesystem like Lustre.", "The CPU memory bus and PCIe pathways become saturated because checkpoint data is bounced through host RAM; use a GPUDirect RDMA or storage-direct path matched to the distributed filesystem.", "The PyTorch training loop is Single-Thread-Bound, taking 45.7 seconds (640GB / 14GB/s) to serialize the tensors.", "The dual 200Gbps NICs are bottlenecking the transfer; upgrading to 400Gbps NICs will eliminate the 45-second stall by providing 100 GB/s."], "correct_index": 1}}, {"id": "cloud-2346", "title": "CXL vs HBM in Embedding Tables", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Will adding a CXL 2.0 Type 3 memory module let the 1.5TB embedding service reach 40M lookups/sec, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2347", "title": "RoCEv2 Incast in 3D Parallelism", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the >60ms tail-latency spikes when 64 GPUs send 50MB to one target, and how would you mitigate this RoCEv2 incast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2348", "title": "NUMA Boundaries in High-Throughput DMA", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is PCIe Rx capped at 11GB/s per GPU with one CPU thread saturated, and how would you fix the host transfer path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2349", "title": "Multi-Tenant Serving: LoRA vs KD", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy should you choose for 100 customized 8B LLM tenants—100 distilled 1B models or one 8B base with LoRA adapters—and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2350", "title": "The Auto-Scaler Fragmentation Deadlock", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 50 strict 64-GPU jobs stuck despite 4,400 idle GPUs, and why does adding 50 nodes per day not fix the queue?", "chain_ids": ["cloud-chain-auto-005-02"], "chain_positions": {"cloud-chain-auto-005-02": 3}, "chain_tiers": {"cloud-chain-auto-005-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2351", "title": "The Multi-AZ AllReduce Cost Explosion", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did adding 128 nodes in AZ-B cut throughput by 40% and create a massive cross-AZ bill, and how should scheduling change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The new AZ-B nodes have defective networking hardware that drops packets, causing the 40% throughput loss and $170k/week egress cost due to TCP retransmissions. The scheduler should isolate these nodes.", "A topology-agnostic scheduler allowed synchronous Ring AllReduce to span across AZ-A and AZ-B. The slow inter-AZ link bottlenecked the GPUs, and the massive gradient transfers incurred the ~$170k/week egress bill. The scheduler must enforce locality-aware, single-AZ placement.", "The training loop is using FP8 quantization which falls back to CPU computation when spanning multiple AZs, causing the throughput drop and high egress costs. The scheduler should disable FP8 for cross-AZ jobs.", "The 128 new nodes in AZ-B are reading training data from an object store in AZ-A. The cross-AZ data loading is starving the GPUs of batches and generating the $170k/week bill. 
The scheduler should replicate the data to AZ-B."], "correct_index": 1}}, {"id": "cloud-2352", "title": "Asynchronous Checkpoint Storage Tiering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect storage and checkpoint data flow for the 16,384-GPU job to sustain 1.6TB/s reads and keep checkpoint pauses under 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2353", "title": "Petabyte-Scale Multimodal Streaming", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design ingestion to sustain 2 TB/s throughput without hitting API rate limits, excessive costs, or cache-warming penalties?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2354", "title": "Diagnosing PCIe NUMA Mismatch", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do GPUs 4-7 train at 700 images/s while GPUs 0-3 hit 1500 images/s, and how should the PyTorch dataloader be pinned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2355", "title": "Root Causing NVLink P2P Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are TP=4 AllGather ops capped near 24GB/s instead of 300GB/s NVLink on the 4-GPU node, and how would you restore P2P?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2356", "title": "Diagnosing RoCEv2 PFC Storms in MoE", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 3-5s network collapses and NCCL watchdog crashes during MoE All-to-All on RoCEv2, and how would you mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2358", "title": "Diagnosing Pageable Memory Bounce Buffers", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 200MB host-to-GPU feature transfers randomly spike from 8ms to 45ms, and what memory allocation pattern would prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe bus is congested by other background tasks. 
Fix this by upgrading to PCIe Gen5 or reducing the batch size.", "The CUDA driver must dynamically allocate a hidden pinned 'bounce buffer' and perform a host-to-host copy before DMA transfer. Fix this by pre-allocating a persistent ring buffer of pinned memory using `cudaMallocHost`.", "The host CPU is dynamically generating features too slowly, causing GPU starvation. Fix this by moving feature generation to the GPU.", "The `cudaMemcpyAsync` API inherently has high tail latency due to CPU-GPU synchronization overhead. Fix this by switching to synchronous `cudaMemcpy`."], "correct_index": 1}}, {"id": "cloud-2359", "title": "Root Causing Containerized IPC Fallback", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving the 8x GPU DDP job into Kubernetes raise iteration time from 400ms to 1200ms, and how would you fix the NCCL fallback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The container is throttling CPU requests, slowing down the dataloader. Fix this by increasing CPU limits in the Pod spec.", "The container restricts PCIe bandwidth to the GPU. Fix this by adding the `nvidia.com/gpu` resource limit.", "The container restricts the `/dev/shm` shared memory size to 64MB, causing NCCL to fall back from NVLink. Fix this by mounting a larger `tmpfs` volume to `/dev/shm`.", "The Kubernetes network plugin is routing intra-node traffic through the pod overlay network. Fix this by setting `hostNetwork: true`."], "correct_index": 2}}, {"id": "cloud-2360", "title": "Diagnosing PCIe Contention from Checkpointing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does iteration time spike from 2.5s to 18.5s every 1000 steps during async 80GB checkpointing, and how would you reduce the stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2361", "title": "Root Causing RDMA DCQCN Slow Start", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 70B TP-over-RDMA inference stuck at 45ms/token with <10% link utilization and no drops, and how would you tune the network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2362", "title": "Diagnosing CXL Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do 1M random lookups on the 1TB CXL memory table take 120ms while bandwidth is only 2GB/s, and how would you speed them up?", "chain_ids": ["cloud-chain-auto-008-11"], "chain_positions": {"cloud-chain-auto-008-11": 2}, "chain_tiers": {"cloud-chain-auto-008-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CXL 2.0 module is defective 
because it is only achieving 2GB/s instead of 32GB/s. The hardware should be RMA'd and replaced with a module that has working PCIe lanes.", "The PCIe Gen5 bus is congested by other background traffic, artificially capping the CXL module to 2GB/s. The application should use PCIe Quality of Service (QoS) to prioritize embedding fetches.", "Random sparse lookups cause severe CPU cache thrashing. Because CXL latency is ~250ns, serialized fetches limit throughput. The solution is to use software-managed batching and prefetching to overlap latency and saturate the bandwidth.", "Embedding parallelism is unnecessary because embeddings can be stored on CPU and looked up via PCIe; the sparse lookup pattern means only a few KB per batch needs to transfer from CPU to GPU."], "correct_index": 2}}, {"id": "cloud-2363", "title": "Root Causing H2D Serialization in CUDA Streams", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the GPU idle ~34% of the time despite using cudaMemcpyAsync, and how would you overlap transfer and compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2364", "title": "Diagnosing Multi-Tenant RDMA QoS Interference", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 10ms RDMA fraud-inference requests spike to 150ms during a TCP Hadoop shuffle, and what network QoS configuration would prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2365", "title": "Fat-Tree Oversubscription Tradeoff", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you accept the $15M savings from a 2:1 oversubscribed 400Gbps fat-tree, and what is the expected All-Reduce latency penalty?", "chain_ids": ["cloud-chain-auto-027-06"], "chain_positions": {"cloud-chain-auto-027-06": 3}, "chain_tiers": {"cloud-chain-auto-027-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2366", "title": "Dragonfly Topology for MoE Routing", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 32,000-GPU MoE training, should you choose Dragonfly over a Fat-Tree for 60% fewer links, and what latency risks arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2367", "title": "ECMP vs Adaptive Routing for Elephant Flows", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you replace static ECMP with packet-level adaptive routing to push RoCEv2 utilization above 60%, and what hardware tradeoff does it
create?", "chain_ids": ["cloud-chain-auto-secondary-013-31"], "chain_positions": {"cloud-chain-auto-secondary-013-31": 3}, "chain_tiers": {"cloud-chain-auto-secondary-013-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2368", "title": "RoCEv2 vs InfiniBand Congestion Control", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 8,192 GPUs with frequent 64-to-1 incast, how do InfiniBand and RoCEv2 trade off congestion handling, deadlock risk, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2369", "title": "Rail-Optimized vs Standard Leaf-Spine", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which 8-NIC leaf-spine layout should you choose—same-ToR or rail-optimized—and how do they trade All-Reduce performance against fault tolerance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2370", "title": "RDMA vs RPC for Disaggregated KV-Cache", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For transferring a 15GB KV-cache between prefill and decode nodes, how do gRPC over TCP and GPUDirect RDMA trade latency, CPU load, and complexity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2371", "title": "AEC vs AOC for 800G Leaf Links", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 24,000 800Gbps 1.5-2.5m NIC-to-leaf links, should you use AECs or AOCs, and what are the power tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2372", "title": "PFC Deadlock Recovery Latency Tradeoff", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you choose between 100us and 50ms PFC deadlock timeouts, and what are the performance and stability tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2373", "title": "Scale-Up vs Scale-Out for 256 GPUs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which 256-GPU architecture should you choose for 8-way TP and 32-way PP, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2374", "title": "FEC Latency in 
Distributed Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you disable KP4 FEC to save 110 ns per hop, or keep it enabled for the 15-hop token path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2375", "title": "3D Torus Bisection Bandwidth Limitation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you run the 4,096-GPU All-Reduce workload on the 3D Torus or migrate to a non-blocking Fat-Tree, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2376", "title": "RDMA Memory Registration Overhead", "topic": "mlops-lifecycle", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the GNN pipeline use dynamic RDMA memory registration or a pre-pinned 100GB GPU pool, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2377", "title": "Cloud-Assisted Wake Word Pipeline", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you split wake-word and intent processing between the MCU and the cloud to stay under the 20mW budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2379", "title": "OTA Delta Updates for Wearables", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect the cloud-to-wearable update pipeline for a 200KB model over 50 kbps BLE to minimize power and SRAM usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2380", "title": "Tiered Inference for Video Doorbells", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tier PIR, MCU person detection, and cloud Face ID so the 5000mAh doorbell lasts 6 months?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 3}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2382", "title": "Federated Personalization of Wake Words", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you personalize the 50KB keyword model 
on 256KB earbuds without sending audio or exceeding SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2383", "title": "Edge-Cloud Traffic Camera Bandwidth Optimization", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you split perception and tracking between 5,000 LTE cameras with edge NPUs and the cloud to reduce data cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2384", "title": "Privacy-Preserving Health Wearable FL", "topic": "federated-learning", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect federated sleep-stage training for 1M smart rings without draining the 20mAh battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Upload raw PPG and Temp data continuously over BLE to a phone gateway for centralized cloud training, avoiding on-device compute.", "Use cross-device Federated Learning with edge feature extraction. The ring extracts low-dimensional features, performs local SGD while charging, and syncs lightweight model updates via BLE to the phone.", "Stream the raw 50Hz data directly to the cloud via the ring's built-in LTE connection, bypassing the phone entirely to save BLE power.", "Send the full raw data over BLE in a single batch once a week, buffering the 60 MB of data in the ring's 128KB SRAM."], "correct_index": 1}}, {"id": "cloud-2385", "title": "High-Throughput Streaming Drift Detection", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign drift monitoring for 20,000 QPS of 256-dimensional embeddings without OOMing the sidecar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2386", "title": "Cost-Aware Shadow Testing Architecture", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the two-tower model without paying for a 100% 7-day shadow deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2387", "title": "Resolving Feature-Model Desynchronization", "topic": "mlops-lifecycle", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you prevent the V2 model rollout from causing Redis cache-miss storms on user_affinity_v2?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2389", "title": "Mitigating Cold Start Cascading Failures", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you redesign autoscaling so 60GB model weights don't make new pods crash-loop during the 3,000 QPS spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2391", "title": "Drift Detection with Extreme Label Delay", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect the new fraud ring before 45-day chargeback labels mature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2392", "title": "Decoupling Storage in High-Frequency Deployments", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign model deployment so 40 daily updates don't push 15GB Docker images or slow scale-outs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2393", "title": "Production Debugging of Tensor Memory Leaks", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you stop the PyTorch serving pods from OOMKilling every ~12 days while debugging the underlying memory leak?", "chain_ids": ["cloud-chain-auto-008-07"], "chain_positions": {"cloud-chain-auto-008-07": 2}, "chain_tiers": {"cloud-chain-auto-008-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2394", "title": "Eliminating Non-Determinism in ML CI/CD", "topic": "mlops-lifecycle", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the golden-logit GPU integration test flaky at 1e-6 tolerance, and how would you stabilize it?", "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2395", "title": "Power-Capped Rack Density Tradeoff", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which rack configuration maximizes total tokens/sec under the 40kW limit, and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 5}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2396", "title": "Carbon-Aware Multi-Region Routing", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", 
"zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the weekly 10,000 GPU-hour job run in US-East or EU-North to minimize carbon, and what is the cost tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2397", "title": "Minimizing Serving Cost per Token", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which instance strategy minimizes TCO per million Mixtral tokens for the 10,000 tok/sec SLA, A100 or H100?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2398", "title": "Federated Learning Edge-Cloud Energy", "topic": "federated-learning", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do Configurations A and B compare on edge energy and cloud cost, and can either meet the 2% battery limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Config A drains 13.9% battery and costs $200; Config B drains 7.8% and costs $40. Neither meets the strict 2% battery limit.", "Config A meets the 2% battery limit because it only runs 1 local epoch, whereas Config B drains 7.8% battery due to the high compute cost of 10 epochs.", "Config B meets the 2% battery limit because it reduces communication rounds from 100 to 20, saving massive amounts of Tx/Rx energy.", "Both configurations meet the 2% battery limit, but Config B is preferred because it reduces cloud costs from $200 to $40."], "correct_index": 0}}, {"id": "cloud-2399", "title": "Datacenter Liquid Cooling TCO", "topic": "thermal-management", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 20MW datacenter, should you choose air or direct-to-chip liquid cooling over 3 years, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-43"], "chain_positions": {"cloud-chain-auto-secondary-015-43": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2400", "title": "API Defenses Against Model Extraction", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you defend the 70B LLM API against model extraction while keeping TTFT under 150ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2401", "title": "TEEs for Cloud Medical Inference", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "How would you deploy the 5GB ViT-L with TEEs for HIPAA data-in-use encryption while supporting 200 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2402", "title": "Ghost Clipping for 24h DP-SGD on 500M Records", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the training plan trade off memory, privacy, and throughput to train the 500M-record embedding model at epsilon ≤ 1.5 in under 24 hours without OOMing A100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2403", "title": "Defending Against Streaming Poisoning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you defend the 30-minute fine-tuning pipeline against data poisoning in 50,000 phishing reports/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2404", "title": "Secure ML Model Supply Chain", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you secure PyTorch model loading against Pickle RCE without exceeding the 15s P99 pod startup SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2405", "title": "Pipelined Guardrails in Compound Systems", "topic": "compound-ai-systems", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you orchestrate retrieval, generation, and the 300ms guardrail to meet the 1000ms P90 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a Pipelined Guardrail architecture by chunking the LLM output every 50 tokens. 
The guardrail evaluates chunks asynchronously, overlapping with the generation of subsequent chunks, which reduces the critical path to 950ms and meets the SLA.", "Stream the LLM response directly to the client as it is generated, bypassing the guardrail for the first 100 tokens to ensure the Time-To-First-Token (TTFT) meets the 1000ms SLA.", "Reduce the LLM generation target from 150 tokens to 100 tokens to save 200ms, and run the 300ms guardrail sequentially at the end to achieve exactly 950ms.", "Deploy the Guardrail model onto the same GPU as the main LLM to eliminate network transfer latency, saving exactly 200ms to hit the 950ms target."], "correct_index": 0}}, {"id": "cloud-2406", "title": "Activation Outliers in Large Models", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the W8A8 degradation, and how do you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2407", "title": "FP8 MoE Routing Collapse", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cause of this sudden routing degeneration, and how do you fix the precision configuration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2408", "title": "KV Cache Quantization Sinks", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this long-context degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2409", "title": "INT4 QAT STE Boundary Oscillation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the mechanistic cause of this training instability during INT4 QAT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2410", "title": "Calibration Overfitting in GPTQ", "topic": "extreme-quantization", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did Java performance specifically degrade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2411", "title": "Unfused Dequantization Bottleneck", "topic": "extreme-quantization", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the architectural cause of this missing speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2412", "title": "Batch 
Size vs Quantization Pareto Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this inversion in the performance Pareto curve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2413", "title": "Batch-1 TPU MXU Underutilization", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is TPU v4 slower than A100 at batch size 1, and how would you co-design the serving stack to fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-11"], "chain_positions": {"cloud-chain-auto-secondary-015-11": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2414", "title": "MoE Routing Bandwidth Starvation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 128-expert MoE only 12% utilized, and what hardware-aware routing or placement change would restore compute efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2415", "title": "FPGA Spatial Pipeline Mapping", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did standard HLS miss the 10us SLA, and how would you map the 5-layer MLP onto the FPGA to meet it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2416", "title": "Choosing FP8 E4M3 vs E5M2 on H100 for Attention Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FP8 formats should you use for weights and activations on H100, and why is E4M3 failing in attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2417", "title": "Custom ASIC SRAM Tiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the compiler map self-attention for sequence length 4096 onto the 16MB SRAM to avoid HBM spilling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2419", "title": "Unstructured vs Structured Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 60% unstructured pruning fail to speed up ResNet-50 on A100, and what 
sparsity pattern would actually accelerate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2420", "title": "Arithmetic Intensity & Cache Thrashing", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does co-locating the embedding model under MPS spike MLP latency, and should you use MPS or MIG?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2421", "title": "H100 FP8 Migration Throughput Cliff", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does moving from 4x A100 INT8 to 2x H100 FP8 barely increase QPS, and what must change in the KV cache path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2422", "title": "W8A8 PTQ Extreme Activation Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does naive W8A8 PTQ break the 130B model, and how would you handle the 0.1% activation outlier channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2423", "title": "Long-Context INT4 KV Cache Degradation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did INT4 KV cache quantization hurt needle-in-haystack accuracy, and how would you preserve long-context retrieval?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2424", "title": "CTR Embedding QAT Pipeline Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did FakeQuant QAT slow embedding training by 4x, and how would you redesign QAT to meet the daily retraining SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2425", "title": "Multi-Tenant LoRA Base Model Calibration", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do code LoRA adapters lose pass@1 after INT8 base quantization while chat is fine, and how should you calibrate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2426", "title": "W4A16 Batch-Size Latency Inversion", "topic": "extreme-quantization", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": 
"evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the GPTQ INT4 model faster at batch size 1 but slower than FP16 at batch size 128?", "chain_ids": ["cloud-chain-auto-secondary-011-24"], "chain_positions": {"cloud-chain-auto-secondary-011-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-011-24": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2427", "title": "FP8 Distributed Training Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did all-E4M3 FP8 training produce zero router gradients and NaNs, and which FP8 formats should be used for forward and backward?", "chain_ids": ["cloud-chain-auto-014-11"], "chain_positions": {"cloud-chain-auto-014-11": 1}, "chain_tiers": {"cloud-chain-auto-014-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2428", "title": "Diffusion Model INT8 PTQ Artifacts", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why does INT8 PTQ cause SDXL banding despite target-domain calibration, and how should activation scales be calibrated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2429", "title": "The FP8 Autoregressive Decoding Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did decoding not scale directly with the 2x compute increase, and what explains the ~1.8x speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2430", "title": "The A10G vs V100 CNN Regression", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did MobileNetV3 inference get 50% slower on an A10G despite 92% higher FP16 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2431", "title": "The DLRM Vertical Fusion Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did fusing embedding lookup with the first dense MLP layer double DLRM step latency on TPU v4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2432", "title": "The RoPE Recomputation Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does recomputing RoPE sin/cos on the fly make the kernel 4x faster despite 400% 
more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2433", "title": "The GQA Prefill Illusion", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does GQA improve decoding throughput 3x but reduce end-to-end latency by only 2% for 50k-token prompts and 10 outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2434", "title": "The CSR Sparsity Performance Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing dense GEMMs with CSR SpMM make the 70%-sparse BERT-Large 5x slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2435", "title": "The Over-Batched T4 Throughput Plateau", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does increasing BERT-base batch size from 32 to 256 flatline throughput while P99 latency spikes on T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2436", "title": "LLM Decoding Bandwidth Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 100 tokens/sec at batch size 1 impossible for the 30B FP16 model on one A100 despite needing only 6 TFLOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2437", "title": "The Long-Context Attention Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Would upgrading to H100 deliver a 3x speedup for 64k-token MHA prefill, or what should you change instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2438", "title": "The Quantization Throughput Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does quantizing the DLRM MLPs to INT8 leave throughput flat at about 5,100 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2439", "title": "Graph Neural Network Hardware Selection", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which hardware should you choose for the GraphSAGE inference 
workload, A100 or TPU v4, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2440", "title": "Continuous Batching Arithmetic Shift", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Will continuous batching raise decode-step latency to 280ms, and what can still affect request latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2441", "title": "Tensor vs Pipeline Parallelism Roofline", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "At batch size 1, should the 70B model use PP=8 or TP=8 on the 8-GPU node, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2442", "title": "The MoE Memory Bandwidth Tax", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 8x40B MoE with the same active FLOPs as the 40B dense model drop from 500 to 120 tokens/sec?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 1}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2443", "title": "The Unstructured Pruning Illusion", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 50% unstructured pruning halve FLOPs but leave 14B batch-1 decoding latency stuck at 31ms on A10G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2444", "title": "Batched Prefill Throughput Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does batching 16 simultaneous 2048-token prefill requests barely increase tokens/sec on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2445", "title": "FlashAttention Head Dimension Spillage", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does doubling the attention head dimension from 128 to 256 slow FlashAttention-2 training by 45% despite identical FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2446", "title": "The Activation Checkpointing Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", 
"level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does full activation checkpointing increase 30B fine-tuning step latency by 75% instead of the expected 33%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2447", "title": "MoE Routing at High Batch Size", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 16x10B top-1 MoE 30% slower than the dense 30B model at batch size 128 despite 3x fewer active FLOPs?", "chain_ids": ["cloud-chain-auto-006-03"], "chain_positions": {"cloud-chain-auto-006-03": 2}, "chain_tiers": {"cloud-chain-auto-006-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2448", "title": "The W16A8 Quantization Trap", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does W16A8 quantization fail to reduce batch-1 TPOT for the 70B model on H100, and what quantization would help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2449", "title": "The MQA Prefill Disappointment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does replacing MHA with MQA fail to reduce prefill latency for a single 16K-token prompt on the L4 GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2450", "title": "The Embedding Layer Bandwidth Illusion", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the embedding lookup limited to about 22.3 GB/s on H100 despite peak bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2451", "title": "Speculative Decoding Draft Model Sizing", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which draft model should you choose for speculative decoding, the 1.5B or 7B, to meet the 20ms TPOT budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2452", "title": "Prefill and Decode Cluster Disaggregation", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you disaggregate prefill and decode over 100Gbps RoCE, and what latency tradeoff must you evaluate?", "chain_ids": ["cloud-chain-auto-006-02"], 
"chain_positions": {"cloud-chain-auto-006-02": 2}, "chain_tiers": {"cloud-chain-auto-006-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2453", "title": "Paged Attention Block Size Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 256-token Paged Attention block size cause OOM at batch size 32, and what block-size tradeoff should you make?", "chain_ids": ["cloud-chain-auto-008-02"], "chain_positions": {"cloud-chain-auto-008-02": 2}, "chain_tiers": {"cloud-chain-auto-008-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2454", "title": "Chunked Prefill for Latency Jitter", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does chunked prefill reduce P99 TPOT spikes from the 4000-token prompt, and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-021-10"], "chain_positions": {"cloud-chain-auto-021-10": 2}, "chain_tiers": {"cloud-chain-auto-021-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2455", "title": "KV Cache PCIe Offloading", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design CPU KV-cache offload for 100K-token sessions, and what latency tradeoff must you manage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2456", "title": "Multi-LoRA Continuous Batching", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can you batch requests across 100 different LoRA adapters on one 13B base model without destroying GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2457", "title": "System-Prompt Prefix Caching", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the benefits and costs of using RadixAttention prefix caching for the shared 1500-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2458", "title": "The Edge Fleet Compilation Dilemma", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you move edge model compilation to the cloud without 10-minute device downtime or 15% deployment failures?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2459", "title": "The NPU Fallback Latency Cliff", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does YOLOv8-nano jump to 185ms and 85% CPU utilization despite targeting the same edge NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2460", "title": "The Hybrid Cascade Bandwidth Trap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does end-to-end latency spike above 2500ms during the festival despite cloud GPU autoscaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2461", "title": "The Unfused Dequantization Tax", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the W4A16 INT4 model run at only 3 tokens/sec on the mobile GPU, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2462", "title": "The Cloud-to-Edge Calibration Shift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the INT8 model fail at night while the FP32 model works, and how would you fix the quantization pipeline?", "chain_ids": ["cloud-chain-auto-014-14"], "chain_positions": {"cloud-chain-auto-014-14": 0}, "chain_tiers": {"cloud-chain-auto-014-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2463", "title": "The NPU Context-Switching Tax", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the three-model edge pipeline 80ms instead of 30ms, and how would you restructure serving to improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2464", "title": "The Asymmetric Heterogeneous Offload", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is forcing the whole ViT onto the weaker mobile GPU faster than splitting MatMuls to the NPU and LayerNorm/Softmax to the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2465", "title": "Cloud-Edge Cascade for Retail Analytics", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": 
"create", "status": "published", "phase": "both", "question": "How would you architect a cloud-edge cascade to minimize 3-year TCO while maintaining >95% recall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2466", "title": "Cost-Aware Hybrid LLM Routing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you route summarization between on-device Llama-3-8B and cloud models to cut cost while keeping latency under 2s for all users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2467", "title": "Optimizing Edge CI/CD Device Farm Costs", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the device-farm deployment pipeline to cut testing cost by 80% without letting NPU-crashing models ship?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2468", "title": "Cellular Cost Optimization via Edge LoRA", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign the daily POS model update system to reduce cellular data costs by more than 90% while preserving offline inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2469", "title": "Edge-Triggered Cloud Telemetry", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you selectively ingest rare pedestrian edge cases from 10,000 robots within a $500/day 5G budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2470", "title": "Hybrid ASR Compute Offloading", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you add on-device processing to cut smart-speaker cloud costs by at least 50% without increasing latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2471", "title": "On-Device Ad Ranking for Cloud Savings", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign ad ranking to cut cloud DLRM costs by 75%, keep latency under 100ms, and use local app history?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2472", "title": "Cloud Fallback Minimization 
via AOT Compilation", "topic": "mlops-lifecycle", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you architect the deployment pipeline to reduce cloud fallback caused by Android NNAPI/CPU execution failures while keeping on-device latency under the 1s timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2473", "title": "The NPU Fallback Ping-Pong", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the structurally similar Model B perform 10x worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2474", "title": "The Hybrid Cascade False Positive Storm", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the cloud ingest bandwidth spike by 10x at dusk, crashing the gateway?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2475", "title": "The Edge LLM Memory Wall", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the limiting factor preventing the Edge NPU from achieving higher tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2476", "title": "The Asymmetric Quantization Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did switching from symmetric to asymmetric INT8 quantization raise edge latency from 15ms to 60ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2477", "title": "The Edge Gateway Batching Paradox", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does strict batch size 8 fail the 60ms safety SLA despite higher GPU throughput, and what batching policy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2478", "title": "The Self-Inflicted OTA DDoS", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the midnight OTA rollout of a 2GB model saturate the retail SD-WAN, and how would you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2479", "title": "The Thermal Throttling Ticking Clock", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the framerate drop to 12 fps after exactly 90 seconds of tracking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2480", "title": "The SRAM Spilling Bottleneck", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you modify the compiler to hit 5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2481", "title": "Edge-Cloud Hybrid Video Break-Even", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which option has the lower 3-year TCO for 5,000 traffic cameras, and what accuracy/deployment tradeoffs must you evaluate?", "chain_ids": ["cloud-chain-auto-006-04"], "chain_positions": {"cloud-chain-auto-006-04": 2}, "chain_tiers": {"cloud-chain-auto-006-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2482", "title": "Fleet-Wide Heterogeneous Deployment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you redesign the mobile BERT deployment pipeline to reduce P99 latency and battery drain across 15 Android SoCs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2483", "title": "The NPU Fallback Latency Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the ~330ms P99 latency caused by unsupported INT8 GeLU on the target NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2484", "title": "Dynamic Edge-Cloud Serving Threshold", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you set up a local-cloud cascade to meet the SLA while minimizing cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2485", "title": "Depthwise Quantization Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does PTQ drop MobileNetV2 mAP from 42 to 18, and how would you preserve INT8 accuracy on the edge?", "chain_ids": ["cloud-chain-auto-014-14"], "chain_positions": 
{"cloud-chain-auto-014-14": 1}, "chain_tiers": {"cloud-chain-auto-014-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2486", "title": "Multi-Tenant Edge Accelerator Allocation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you schedule Model A and Model B on one Edge TPU to meet 30fps and 10fps requirements without 80ms latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2487", "title": "Bandwidth-Constrained OTA Pipelines", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the weekly OTA deployment pipeline to reduce cellular data costs by over 90% without degrading model performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2488", "title": "Cloud-Edge Federated Learning Memory Limits", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign federated learning for 5 million Cortex-M33 sensors so training fits in 256KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2489", "title": "Power-Aware Cloud-Edge Fallback", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What maximum BLE fallback rate should the cloud enforce when adapting the confidence threshold so the wearable can meet 14-day battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2490", "title": "Async FL for Intermittent Solar Devices", "topic": "federated-learning", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign FedAvg for 100,000 solar sensors that wake for only 5 seconds per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2491", "title": "Smartwatch Federated Learning with INT8 Adapters", "topic": "federated-learning", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you enable federated learning on 2 million INT8 smartwatches without exceeding 256KB SRAM or BLE bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2492", "title": "OTA Model Patching for Flash Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", 
"zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you deliver updates without needing dual-bank Flash?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2493", "title": "10,000-Sensor Predictive Maintenance Under a 50MB/s WiFi Cap", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition predictive-maintenance inference so 10,000 Cortex-M4 sensors stay under the 50MB/s WiFi cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2494", "title": "Wildlife Collar FL Under a 36 J/day Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the collar FL update policy to stay within the strict 36 Joules/day energy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2495", "title": "Federated Learning Power and Comm Budget", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How many FL rounds per day can each watch afford, and how should the cloud orchestrate participation under the 1% battery limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2496", "title": "Cloud NAS Hardware-in-the-Loop Optimization", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the NAS model draw 15mW despite fitting in SRAM, and how should the NAS pipeline change to meet the <5mW target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2497", "title": "NB-IoT OTA Updates for 150KB INT8 TFLite Micro Models", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you update the 150KB INT8 model weekly over NB-IoT without violating the 5-year battery-life constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2498", "title": "Asynchronous FL with Edge Stragglers", "topic": "federated-learning", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What FL strategy should replace synchronous FedAvg when 85% of solar sensors drop during the 5-minute training window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2499", "title": "Edge-Cloud Partitioning for Acoustic 
Monitoring", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you balance local inference and cloud fallback for 1000 bird-call events/day to save energy while preserving rare-bird accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2500", "title": "Memory Bottleneck in Edge Personalization", "topic": "federated-learning", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign on-device personalization so the smart ring can train without exceeding its 256KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2501", "title": "Cloud Drift Detection via Edge Proxies", "topic": "federated-learning", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you monitor concept drift across 1M vibration sensors over 10kbps LoRa without uploading raw 16kHz data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2502", "title": "Mixed-Precision Federated Aggregation", "topic": "federated-learning", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you orchestrate FL for INT8 Cortex-M0+ devices without FP32 support while avoiding INT8 training divergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2503", "title": "Carbon-Aware Scheduling Tradeoffs", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you pause RLHF training for 6 high-carbon hours, or use another strategy to reduce carbon, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2504", "title": "LLM Serving Batch Size Energy", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you meet the 50ms TTFT SLA without destroying cost per token for Llama-3-70B serving?", "chain_ids": ["cloud-chain-auto-secondary-009-16"], "chain_positions": {"cloud-chain-auto-secondary-009-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2505", "title": "Overprovisioning Under Power Caps", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Under a 10MW cap, would you deploy 10,000 
GPUs at 1000W or 14,000 GPUs power-capped to 700W, and what is the TCO trade-off?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 4}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2506", "title": "Liquid Cooling Retrofit ROI", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does the $8M D2C liquid-cooling retrofit pay off for a 5MW datacenter, and what operational factors affect the decision?", "chain_ids": ["cloud-chain-auto-secondary-015-43"], "chain_positions": {"cloud-chain-auto-secondary-015-43": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2507", "title": "Model Compression Carbon ROI", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "When does the INT8 distillation project break even on carbon versus continuing FP16 inference for 5M daily requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2508", "title": "Cloud vs Edge FL Energy Tradeoff", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is moving personalization training from 500 GPU-hours to FL across 500,000 smartphones greener, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2509", "title": "15kW Rack Scheduling for Mixed Inference and MatMul Workloads", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule memory-bound inference and compute-bound MatMul workloads to maximize 15kW rack utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2510", "title": "Cloud Through Silicon Vias L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can't traditional wire-bonding between stacked DRAM dies achieve HBM3's bandwidth density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2511", "title": "HBM3 vs DDR5 Bandwidth Comparison", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many DDR5 channels would be required to match the total memory bandwidth of the HBM3 design?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2512", "title": "Cloud Through Silicon Vias L5 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you evaluate the cost-performance trade-off between the 6-stack and 4-stack HBM3 accelerators for serving a 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2513", "title": "Cloud Top Of Rack Switch L2 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do top-of-rack switches exist, and what failure-domain implications do they create for distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2514", "title": "Cloud Top Of Rack Switch L3 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum cross-rack AllReduce bandwidth per GPU with 64 GPUs sharing a 400Gbps ToR uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2515", "title": "1024-GPU Training Restart vs 64-GPU Rack Repair", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After losing one 64-GPU rack, should you restart on 960 GPUs or wait 30 minutes for repair, and what is the break-even time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2516", "title": "TorchDynamo Bytecode-Level Capture vs. 
Operator-Level Tracing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does TorchDynamo's Python-bytecode-level capture matter, and how does it handle dynamic control flow?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 0}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2517", "title": "TorchDynamo Speedup and Graph Break Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What speedup was achieved, and what speedup would be possible if graph breaks were eliminated so the entire model could be compiled?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 1}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2518", "title": "Torch Compile Graph Break Refactor vs 30 Percent More GPUs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you spend 2 weeks eliminating graph breaks or buy 30% more GPUs for the 14-day, 64-A100 training run?", "chain_ids": ["cloud-chain-auto-005-07"], "chain_positions": {"cloud-chain-auto-005-07": 3}, "chain_tiers": {"cloud-chain-auto-005-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2519", "title": "Cloud Train Serve Split L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the train-serve split architecture, and why does it work for deploying a cloud-trained 7B model on edge devices?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": {"cloud-chain-auto-001-01": 0}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2520", "title": "Cloud Train Serve Split L3 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the estimated FP16 and INT8 inference memory footprints and throughputs for the 3B model?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": {"cloud-chain-auto-001-01": 1}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2521", "title": "Cloud vs. 
Edge Inference Deployment Trade-offs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should you choose train-cloud/serve-edge over cloud-only inference despite the added deployment complexity?", "chain_ids": ["cloud-chain-auto-001-01"], "chain_positions": {"cloud-chain-auto-001-01": 2}, "chain_tiers": {"cloud-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2522", "title": "Cloud Transformation Lineage L2 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does transformation lineage prevent undocumented training-data pipeline changes from causing hard-to-debug regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2523", "title": "Cloud Transformation Lineage L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage and latency overhead would lineage metadata add to an 8-step pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2524", "title": "Cloud Transformation Lineage L5 0", "topic": "mlops-lifecycle", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use dataset-level or record-level lineage for 100M records across 8 steps, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2525", "title": "Transient NaN Failures in 512-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What are transient failures, and why do they become routine in a 512-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2526", "title": "Cloud Transient Failures L3 0", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many transient failures should you expect over 30 days on 1024 GPUs, and what is the chance of zero failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2527", "title": "Cloud Transient Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare redundant computation, 
gradient monitoring with rollback, and validation checks for transient-failure handling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2528", "title": "Cloud Transient Loads L2 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are transient power loads, and why is a 2MW, 500ms AllReduce spike dangerous for the datacenter?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 0}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2529", "title": "Cloud Transient Loads L3 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What transient power swing and energy buffer are needed when 4,096 H100s jump from 40% to 95% utilization?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 1}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2530", "title": "Cloud Transient Loads L5 0", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which transient-power mitigation—buffering, staggered AllReduce, or 80% power caps—would you choose, and why?", "chain_ids": ["cloud-chain-auto-015-03"], "chain_positions": {"cloud-chain-auto-015-03": 2}, "chain_tiers": {"cloud-chain-auto-015-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2531", "title": "Ultra Ethernet vs RoCE for AI Collectives", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What makes Ultra Ethernet different from standard RoCE, and why does that matter for AI collective workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2532", "title": "Cloud Ultra Ethernet L3 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What effective bandwidth should you expect for 32 AllReduce flows over 4 paths with ECMP versus packet spraying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2533", "title": "Cloud Ultra Ethernet L5 0", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 2048-GPU, 60-day 175B GPT training run, would you choose InfiniBand NDR or Ultra 
Ethernet, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2534", "title": "Why GPUs Outperform CPUs on Neural Network Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are GPUs faster than CPUs for neural network inference despite lower clock speeds, and what role do SIMD vector operations play?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2535", "title": "ReLU Roofline Analysis on AVX-512 CPU vs H100 GPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long would ReLU on a 10M-element tensor take on an AVX-512 CPU (3 GHz, 16-wide, 50 GB/s memory bandwidth) vs. an H100 GPU (3.35 TB/s), and is it compute- or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2536", "title": "Diagnosing Branch Divergence in Custom Activation Kernels", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the custom if-else activation kernel 5x slower than ReLU on GPU, and what fixes would you evaluate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2537", "title": "Cloud Wafer Scale Engine L2 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a wafer-scale chip differ from a multi-GPU setup, and what communication advantage does it provide?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 0}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2538", "title": "WSE-2 13B FP16 SRAM Fit and HBM Decode Speedup", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does a 13B FP16 model fit in WSE-2's 40 GB SRAM, and how much faster is SRAM-speed access than HBM for decode?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 1}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2539", "title": "Choose GPUs vs Wafer-Scale Engine for 13B Model Serving", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 10M daily 200-token requests on a 13B model, should the startup choose 
the GPUs or the WSE, and why?", "chain_ids": ["cloud-chain-auto-006-08"], "chain_positions": {"cloud-chain-auto-006-08": 3}, "chain_tiers": {"cloud-chain-auto-006-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2540", "title": "Warm Restart After Single-GPU Failure in 256-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is warm restart preferable to cold-restarting the 256-GPU job after GPU #47 fails, and how does it work?", "chain_ids": ["cloud-chain-auto-004-06"], "chain_positions": {"cloud-chain-auto-004-06": 0}, "chain_tiers": {"cloud-chain-auto-004-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2541", "title": "Cloud Warm vs Cold Restart Downtime Analysis", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Over 14 days, how much downtime should the 1024-GPU job expect with cold restarts versus warm restarts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2542", "title": "Evaluate Hot Spares vs Cold Restarts for 2048-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 2048-GPU cluster, is maintaining 16 hot spares (0.8% overhead) worth the cost compared with cold restarts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2543", "title": "Cloud Warp Divergence L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is warp divergence, and why would routing tokens to different MoE experts within one warp cause a 3x slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2544", "title": "Cloud Warp Divergence L3 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 25% short-path tokens and 75% long-path tokens randomly assigned to 32-thread warps, what is the expected warp time and efficiency loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2545", "title": "Cloud Warp Divergence L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate sorting tokens, block-level branching, and padding as fixes for warp divergence in the sparse attention kernel?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2546", "title": "Weight-Stationary vs Output-Stationary Accelerator Dataflow", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is weight-stationary dataflow, when is it advantageous, and how does it differ from output-stationary dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2547", "title": "Weight-Stationary Accelerator Memory Bandwidth for FP16 Matrix", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 2048x2048 FP16 matrix on 256 PEs with 2 KB SRAM each, how many weight tiles are needed and what is the weight traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2548", "title": "Cloud Weight Stationary L5 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which dataflow (weight-stationary or output-stationary) would you choose for 70B LLM batch-1 decode versus ResNet-50 batch-256 inference, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2549", "title": "Optimizing Checkpoint Frequency with the Young-Daly Formula", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why checkpoint every 30 minutes instead of every 5 minutes, and what trade-off does the Young-Daly formula optimize?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 0}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2550", "title": "Optimal Checkpoint Interval using the Young-Daly Formula", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using Young-Daly with 8-hour MTBF and 3-minute checkpoints, what is the optimal interval and expected training efficiency?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 1}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2551", "title": "Young-Daly Formula Optimization for Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which investment 
recovers more training efficiency, reducing checkpoint time to 1 minute for $50K, or doubling MTBF to 8 hours for $100K?", "chain_ids": ["cloud-chain-auto-004-01"], "chain_positions": {"cloud-chain-auto-004-01": 2}, "chain_tiers": {"cloud-chain-auto-004-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2552", "title": "Cloud Zero Copy Serialization L2 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would zero-copy serialization reduce the 8 ms JSON serialization cost and improve the 15 ms P99 latency?", "chain_ids": ["cloud-chain-auto-011-01"], "chain_positions": {"cloud-chain-auto-011-01": 0}, "chain_tiers": {"cloud-chain-auto-011-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2554", "title": "Cloud Zero Copy Serialization", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a latency-sensitive ML serving API, how would you choose among FlatBuffers, Protobuf, and JSON?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2555", "title": "Cloud Zero Optimizations L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does ZeRO reduce the memory waste of standard data parallelism, and what do Stages 1, 2, and 3 shard?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 0}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2556", "title": "Cloud Zero Optimizations L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 30B AdamW model on 8 GPUs, what is per-GPU memory under ZeRO Stages 1, 2, and 3, and which stage is the minimum needed to fit?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 1}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2557", "title": "ZeRO-2 vs ZeRO-3 Memory and Communication Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Given ZeRO-3 adds 50% communication but may enable 2x larger micro-batches, is it worth switching from ZeRO-2?", "chain_ids": ["cloud-chain-auto-008-03"], "chain_positions": {"cloud-chain-auto-008-03": 2}, "chain_tiers": {"cloud-chain-auto-008-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-2558", "title": "Cloud Fully Sharded Data Parallel L2 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does FSDP differ from PyTorch DataParallel and DDP, and why does it enable training larger models?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 0}, "chain_tiers": {"cloud-chain-auto-013-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2560", "title": "Choose Hybrid Parallelism for 30B Model on 64 GPUs", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy minimizes inter-node communication for the 30B model: full FSDP, node-local FSDP plus DDP, or TP plus FSDP?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 3}, "chain_tiers": {"cloud-chain-auto-013-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2561", "title": "Cloud Learning Rate Scheduling L2 0", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is learning rate scheduling necessary when a fixed 1e-3 LR causes LLM loss to plateau after 20% of training?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2562", "title": "Learning Rate and Warmup for 8x Batch Size Scaling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the linear scaling rule, what peak learning rate and warmup steps should you use if the original used LR=3e-4 with 1000 warmup steps?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2563", "title": "Decide Whether to Restart Flattened LLM Pretraining Loss", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 70% through a 30-day LLM pretraining run with flattened loss, should you restart, apply an LR warm restart, or continue?", "chain_ids": ["cloud-chain-auto-secondary-016-02"], "chain_positions": {"cloud-chain-auto-secondary-016-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-2564", "title": "Cloud Gpu Virtualization L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How can GPU virtualization improve utilization when 20 models each use less than 10% of a modern GPU?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 0}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2565", "title": "Cloud Gpu Virtualization L3 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What MIG partition scheme should serve a 14 GB 7B model and a 2 GB 1B model on one A100?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 1}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2566", "title": "Cloud Gpu Virtualization L5 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For multi-tenant GPU sharing with workloads from 5% to 60% utilization, how do MIG, MPS, and Kubernetes time-slicing compare?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 3}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2567", "title": "Cloud Tokenization L2 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does the Chinese text tokenize into 350 tokens versus 100 for English, and how does that specifically affect memory and compute costs?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 0}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2568", "title": "Cloud Tokenization L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If a 7B LLM expands from a 32K to 128K vocabulary and cuts token count by 40%, what is the memory trade-off?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 1}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2569", "title": "Shared vs Language-Specific Tokenizers for a 50-Language LLM", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 50-language LLM, what are the systems trade-offs between one 256K BPE vocabulary and language-specific 32K tokenizers?", "chain_ids": ["cloud-chain-auto-003-13"], "chain_positions": {"cloud-chain-auto-003-13": 2}, "chain_tiers": {"cloud-chain-auto-003-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2570", "title": "Cloud Red Teaming L2 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does red teaming differ from standard model evaluation, and why is it necessary as a systems concern for a chatbot?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 0}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2571", "title": "Cloud Red Teaming L3 0", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long and how much would it cost to run 10,000 red-team prompts generating 50 tokens each at 40 tok/s on one GPU?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 1}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2572", "title": "Comparing Human, Automated, and Hybrid Red Teaming at 5M Requests per Day", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a model serving 5M requests/day, how do human, automated LLM-based, and hybrid red teaming compare in coverage and cost?", "chain_ids": ["cloud-chain-auto-001-08"], "chain_positions": {"cloud-chain-auto-001-08": 2}, "chain_tiers": {"cloud-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2573", "title": "KV Cache and Attention Costs of Expanding from 4K to 32K Tokens", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the memory, compute, and cost implications of increasing the LLM API token budget from 4K to 32K tokens?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 0}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2574", "title": "Cloud Token Budget L3 0", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many concurrent 
requests can the H100 serve at 4K, 16K, and 64K token budgets?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 1}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2575", "title": "Shared Versus Separate GPU Pools for 1K and 32K Token Requests", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use one shared GPU pool or separate pools for the 1K-token and 32K-token requests, and why?", "chain_ids": ["cloud-chain-auto-011-06"], "chain_positions": {"cloud-chain-auto-011-06": 2}, "chain_tiers": {"cloud-chain-auto-011-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2576", "title": "Why ZeRO-3 Is Overkill for a 7B Model on 4 GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are ZeRO-1, ZeRO-2, and ZeRO-3, and why might ZeRO-3 be overkill for a 7B model on 4 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2577", "title": "ZeRO Stage Memory Budget for a 65B AdamW Model on 32 GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the per-GPU memory footprint for ZeRO-1, ZeRO-2, and ZeRO-3, and which stage fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2578", "title": "Flat Versus Hierarchical ZeRO Communication for a 30B Model on 64 GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is more communication-efficient for the 30B model: flat ZeRO-3 across 64 GPUs or hierarchical ZeRO, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2579", "title": "Cloud Megatron Parallelism L2 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does Megatron tensor parallelism split adjacent weight matrices, and why does the column-row pattern reduce communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2580", "title": "Cloud Megatron Parallelism L3 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the all-reduce communication volume and NVLink transfer time added by TP=8 per training step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2581", "title": "Cloud Megatron Parallelism L5 0", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which parallelism layout yields better throughput for the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2582", "title": "QLoRA Memory Budget for Fine-Tuning a 65B Model on a 48 GB GPU", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does QLoRA let a 65B model fit and fine-tune on a single 48 GB GPU?", "chain_ids": ["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 0}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2583", "title": "Cloud Qlora L3 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total QLoRA memory footprint for the 70B model, including adapters, gradients, optimizer states, and activations?", "chain_ids": ["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 1}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2584", "title": "Cost Speed and Quality Tradeoffs for 70B QLoRA Full Fine-Tuning and LoRA", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "With no measured quality scores, which 70B fine-tuning option is the best default balance, and when would you choose QLoRA or full fine-tuning?", "chain_ids": ["cloud-chain-auto-008-09"], "chain_positions": {"cloud-chain-auto-008-09": 2}, "chain_tiers": {"cloud-chain-auto-008-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2585", "title": "Cloud Rlhf Infrastructure L2 0", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can PPO-style RLHF require up to 4x the GPU memory of standard fine-tuning for a 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2586", "title": "PPO RLHF VRAM Sizing for a 13B Model on 80 GB GPUs", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory does PPO RLHF for the 13B model require, and what is the theoretical minimum versus a deployment-safe number of 80 GB GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2587", "title": "RLHF VRAM Budgeting for PPO Versus DPO on a 70B Model", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 70B RLHF, which setup should you choose among colocated PPO, separate generation/training clusters, and DPO, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2588", "title": "Cloud Inference Accelerator Selection L2 0", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What technical factors must you critically evaluate before adopting an ASIC claiming a 10x cost reduction over standard GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2589", "title": "A100 vs Inferentia2 Cost per 1K Tokens at Batch 1 and 64", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the cost per 1K tokens on A100 versus Inferentia2 at batch 1 and batch 64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2590", "title": "GPU Heterogeneous and ASIC-First Inference Deployment Tradeoffs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate all-GPU, heterogeneous, and ASIC-first deployments for five diverse model architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2591", "title": "Benchmark Harness Controls for Run-to-Run Variance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What must a proper ML benchmark harness control to avoid 20% run-to-run variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2592", "title": "Cloud Benchmark Harness", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bias does including the 5 warmup iterations introduce, and what latency should the benchmark report instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2593", "title": "Cloud Benchmark Harness L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the vendor speedup at batch 1 and sequence 128 valid for your batch 32, sequence 2048 
workload, and how do memory bandwidth and compute throughput factor into the evaluation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2594", "title": "Cloud Benchmark Run Rules L2 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do MLPerf run rules matter, and why is a custom learning rate schedule outside the rules a problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2595", "title": "MLPerf ResNet-50 Compliance vs Faster Runtime", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Who wins the MLPerf ResNet-50 benchmark, Team A or Team B, and what is the cost difference if both were valid?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2596", "title": "Cloud Benchmark Run Rules L5 0", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you decide between a $500K MLPerf submission and publishing internal benchmarks for your different workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2597", "title": "Cloud Information Entropy L2 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does information entropy quantify the real diversity of this 90%-majority dataset compared to a balanced dataset?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2598", "title": "Zipfian Vocabulary Shannon Entropy", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under the Zipfian model p(k)=1/(k × H_N) for a 50K-token vocabulary, what is the theoretical minimum bits per token for lossless compression (the Shannon entropy)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2599", "title": "Cloud Information Entropy L5 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you train on the deduplicated 7 TB or the full 10 TB, given 30% deduplication and 15% higher entropy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2600", "title": "Cloud Data Quality As Code L2 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", 
"question": "How would data quality as code have caught the 15% label corruption before the 3-day training run started?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2601", "title": "Cloud Data Quality As Code L3 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What time and compute overhead should you expect for schema, null, distribution, and referential integrity checks on 500 GB daily?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2602", "title": "Cloud Data Quality As Code L5 0", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data quality policy is best for a daily production training pipeline, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-26"], "chain_positions": {"cloud-chain-auto-secondary-015-26": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2604", "title": "Cloud Label Consensus L3 0", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the costs and expected quality improvements of adding 2 annotators per image versus expert-reviewing the 30% disagreements?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2607", "title": "AI-Assisted Labeling Throughput Gain", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much labeling cost does AI pre-annotation save on 500K images at 40 versus 120 images per hour and $25/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2608", "title": "Labeling Strategy Selection Under Budget Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy maximizes labeled data volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2609", "title": "Arithmetic Intensity on the Energy Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", 
"question": "Why does arithmetic intensity determine whether a workload's energy is dominated by FLOPs or data movement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2610", "title": "Computing Energy per Inference from Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total energy and data-movement fraction for 2M FLOPs and 64 KB of HBM traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2611", "title": "Why Federated Learning Sends Gradients Not Data", "topic": "federated-learning", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What federated learning architecture allows training without sharing data, and what makes it privacy-preserving?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2612", "title": "Federated Learning Communication Cost", "topic": "federated-learning", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much bandwidth is required per federated round and over 500 rounds for 100 devices training a 50M-parameter FP32 model?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 1}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2613", "title": "Federated vs Centralized Training Decision", "topic": "federated-learning", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach yields better model quality and why, given that the data is not privacy-sensitive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2614", "title": "TCO Drivers in Federated vs Centralized Training", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the key TCO differences between federated learning across 500 edge nodes and equivalent centralized cloud training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2615", "title": "Federated Learning Break-Even Analysis", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does a positive break-even data volume exist, and what does the cost model show?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2616", "title": "Federated Learning ROI 
Under Regulatory Constraints", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which has better 2-year ROI?", "chain_ids": ["cloud-chain-auto-016-09"], "chain_positions": {"cloud-chain-auto-016-09": 3}, "chain_tiers": {"cloud-chain-auto-016-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2617", "title": "Gradient Inversion Attacks in Federated Learning", "topic": "federated-learning", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is 'raw data never leaves the device' insufficient for privacy, and what attack vector remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2619", "title": "Privacy-Accuracy Trade-off in Federated Deployment", "topic": "federated-learning", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which configuration should ship?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2620", "title": "Training Memory Footprint Breakdown", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does training require 4-8x more memory than inference, and what are the major memory consumers in a training step?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 0}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2621", "title": "Will This Model Fit in GPU Memory?", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much parameter-related memory is needed, and what is the minimum number of A100-80GB GPUs required ignoring activations?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 1}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2622", "title": "Memory Reduction Strategy Selection", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which memory reduction approach should be used for 7B fine-tuning on a single 80GB GPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2623", "title": "Why Micro-Benchmarks Mislead at System Level", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", 
"level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can a GEMM micro-benchmark hit near-peak TFLOPS while end-to-end training only achieves 45% MFU?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 0}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2624", "title": "Isolating Memory Bandwidth via Micro-Benchmark", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 80GB total data transfer in 42ms, what bandwidth is achieved and how does it compare to the hardware's 2.0 TB/s peak?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 1}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2625", "title": "Diagnosing Performance with Micro vs Macro Benchmarks", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you blame the hardware or the software stack, and what micro-benchmarks would you run to diagnose?", "chain_ids": ["cloud-chain-auto-011-07"], "chain_positions": {"cloud-chain-auto-011-07": 2}, "chain_tiers": {"cloud-chain-auto-011-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2626", "title": "Why Time-to-Train Uses Target Accuracy Not Epochs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does MLPerf Training use time to reach target accuracy rather than time per epoch or time for N steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2627", "title": "Estimating Time-to-Train from Hardware Specs", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 64 A100s at 45% MFU, how long will 3.2e18 FLOPs take to train?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2628", "title": "Evaluating Two Clusters for Training Competition", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which cluster wins on time-to-train, and by what factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2629", "title": "Why Raw Agreement Percentage Misleads", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", 
"bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is 90% raw labeler agreement insufficient on its own, and what metric should be used to prove true inter-rater reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2631", "title": "Handling Low Agreement in Production Labeling", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is optimal: relabeling, label smoothing, or reducing the number of classes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2632", "title": "Why nn.Module Uses Parameter Registration", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does PyTorch's nn.Module require explicit nn.Parameter registration instead of treating all tensors as learnable?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 0}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2633", "title": "Counting Parameters in a Module Hierarchy", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many total parameters does this Embedding, 6-block Transformer, and final Linear model have including biases?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 1}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2634", "title": "Module Design for Serialization Robustness", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What breaks after Team B refactors the module class, and what serialization pattern should they use instead?", "chain_ids": ["cloud-chain-auto-005-16"], "chain_positions": {"cloud-chain-auto-005-16": 2}, "chain_tiers": {"cloud-chain-auto-005-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2635", "title": "Training Mode Inference with Dropout and BatchNorm", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why do predictions vary between identical inputs when a deployed model is left in training mode, and which layers cause it?", "chain_ids": ["cloud-chain-auto-005-17"], "chain_positions": {"cloud-chain-auto-005-17": 0}, "chain_tiers": {"cloud-chain-auto-005-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2636", "title": "BatchNorm Train vs Eval Statistics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What value does BatchNorm use for normalization in train mode vs eval mode?", "chain_ids": ["cloud-chain-auto-005-17"], "chain_positions": {"cloud-chain-auto-005-17": 1}, "chain_tiers": {"cloud-chain-auto-005-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2637", "title": "Diagnosing Train-Eval Performance Gap", "topic": "data-efficiency-selection", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With model.eval() correctly called, what is the likely cause of the 92% training to 78% deployment accuracy drop and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2638", "title": "Pure Functions Enable Composable Transforms", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does JAX require pure functions, and how does that enable transformations like vmap, jit, and grad?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 0}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2639", "title": "Vectorization Speedup via vmap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If vmap(grad(loss_fn)) vectorizes 256 per-sample gradients and the GPU has enough parallelism, what speedup and time should you expect?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 1}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2640", "title": "JAX vs PyTorch for Per-Sample Gradients and Higher-Order Derivatives", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the team switch to JAX for per-sample gradients, higher-order derivatives, and auto-vectorization, and what are the trade-offs?", "chain_ids": ["cloud-chain-auto-005-18"], "chain_positions": {"cloud-chain-auto-005-18": 2}, "chain_tiers": {"cloud-chain-auto-005-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2641", "title": "The Ridge Point and Hardware Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What 
are the A100 and H100 hardware balance points, and what do they imply for workload optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2642", "title": "Determining Bottleneck from Hardware Balance", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is this compute-bound or memory-bound on an accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2644", "title": "The ML Test Score Framework", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does the ML Test Score measure, and why is it designed to assess technical debt rather than model accuracy?", "chain_ids": ["cloud-chain-auto-011-10"], "chain_positions": {"cloud-chain-auto-011-10": 0}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2645", "title": "Scoring a Production ML System", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the team's ML Test Score, and how long will it take to reach a production-ready score of 20?", "chain_ids": ["cloud-chain-auto-011-10"], "chain_positions": {"cloud-chain-auto-011-10": 1}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2646", "title": "Prioritizing ML Test Score Improvements", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which investment maximizes operational reliability and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2649", "title": "When Level 2 MLOps Is Premature", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the startup invest 6 months building Level 2 infrastructure?", "chain_ids": ["cloud-chain-auto-011-10"], "chain_positions": {"cloud-chain-auto-011-10": 2}, "chain_tiers": {"cloud-chain-auto-011-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2650", "title": "How Pipeline Jungles Form", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does an ML data pipeline evolve into a pipeline jungle, and why is that more dangerous than regular code complexity?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2651", "title": "Pipeline Jungle Impact on Iteration Speed", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "When does the 3-month Airflow rewrite pay for itself given the current and post-rewrite feature-addition costs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2652", "title": "Pipeline Jungle Remediation Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which remediation approach minimizes production risk while successfully detangling the rigid dependencies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2653", "title": "Why Glue Code Dominates ML Systems", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is only about 5% of many ML systems actual model code, and what makes up the remaining glue code?", "chain_ids": ["cloud-chain-auto-001-09"], "chain_positions": {"cloud-chain-auto-001-09": 0}, "chain_tiers": {"cloud-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2654", "title": "Quantifying Glue Code Maintenance Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many monthly maintenance hours come from the 500 lines of model code and 8,000 lines of glue code?", "chain_ids": ["cloud-chain-auto-001-09"], "chain_positions": {"cloud-chain-auto-001-09": 1}, "chain_tiers": {"cloud-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2655", "title": "Credit Scoring AutoML vs Custom Model Decision", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should they do it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2656", "title": "How Undeclared Data Dependencies Cause Silent Failures", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can changing user_engagement_score from a 30-day to a 7-day rolling average cause a silent model failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2657", "title": "Feature Drift Alerting with Multiple-Comparison Correction", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does this 0.04 shift trigger a 2-sigma alert, and how should you adjust for monitoring 50 features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2658", "title": "Preventing Undeclared Dependencies at Scale", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy best prevents silent failures at scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2659", "title": "Shadow Mode vs A/B Testing vs Canary Releases", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the three main online evaluation strategies (Shadow, Canary, A/B), when is each appropriate, and what risk does each mitigate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2661", "title": "Choosing Evaluation Strategy for a Safety-Critical Model", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which online evaluation strategy should you use for this safety-critical model, given the extreme cost asymmetry between false negatives and false positives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2662", "title": "How Request Pipelining Hides Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For the 5ms/10ms/3ms stages, what throughput, latency, and GPU utilization do you get without pipelining versus with 3-stage pipelining?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 0}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2663", "title": "Pipeline Throughput with Unbalanced Stages", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 100 concurrent requests in the 2ms/3ms/15ms/1ms pipeline, what is the steady-state throughput and bottleneck?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 1}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2664", "title": "Pipelining vs Batching Trade-off", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which serving optimization should be chosen to meet p99 < 50 ms while maximizing throughput headroom?", "chain_ids": ["cloud-chain-auto-001-11"], "chain_positions": {"cloud-chain-auto-001-11": 2}, "chain_tiers": {"cloud-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2665", "title": "Fan-out Tail Latency Amplification at p99", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is the user-facing p99 much worse than the single-shard p99, and which single-shard percentile determines the user-facing p99?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 0}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2666", "title": "Hedged Request Latency Improvement", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming independent latency distributions, what is the approximate new p99 latency and extra backend load?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 1}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2667", "title": "Tail Latency Mitigation Strategy Selection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which approach provides the most cost-effective and operationally sound tail latency mitigation strategy?", "chain_ids": ["cloud-chain-auto-001-12"], "chain_positions": {"cloud-chain-auto-001-12": 2}, "chain_tiers": {"cloud-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2668", "title": "Model Caching in Multi-Model Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is model caching needed when 200 models share VRAM that holds only 20, and how does it differ from traditional web caching?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 0}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2669", "title": "Model Cache Hit Rate and Latency Impact", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "With 80GB VRAM and cache space allocated roughly proportional to traffic, how many models of each size fit, and what hit rate and average latency result?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 1}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2670", "title": "Model Caching Strategy Under Cost Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture minimizes cost while maintaining a strict p99 < 200ms SLA?", "chain_ids": ["cloud-chain-auto-001-10"], "chain_positions": {"cloud-chain-auto-001-10": 2}, "chain_tiers": {"cloud-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2671", "title": "Why Transformers Need Positional Encoding", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does a Transformer without positional encoding treat word-order permutations as equivalent, and why is this architecturally inevitable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2672", "title": "Positional Encoding Memory Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory cost of learned positional embeddings for 8192 positions and 4096 FP16 dimensions, and how does it compare to a 7B parameter model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2673", "title": "Choosing a Positional Encoding for Long-Context LLM", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which positional encoding approach (Absolute, RoPE+NTK, or ALiBi) should you choose for a 128K context LLM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2674", "title": "Receptive Field Growth in Deep CNNs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a CNN's receptive field grow with depth, and why does that matter for detecting objects of different sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2677", "title": "Compute Cost of Adding Network Depth", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "How much do inference time and parameter count increase when adding 5 identical 256-filter 3×3 layers to the 10-layer CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2678", "title": "Depth vs Width for Limited Compute Budget", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture achieves better accuracy under the 1 GFLOP constraint, and what structural components are necessary for it to train successfully?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2679", "title": "How Recommendation Systems Create Feedback Loops", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How can monthly retraining a recommender on its own engagement data create a runaway feedback loop, and why is it hard to detect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2680", "title": "Quantifying Feedback Loop Amplification", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After 3 retraining cycles with +/- 20% observation-bias updates, what final observed crime rates does the model learn for the top-10 and bottom-90 neighborhoods, and what disparity results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2681", "title": "Ad-Targeting Exploration Policy for Feedback Loops", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which production exploration strategy should break the ad-targeting feedback loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2682", "title": "Graceful Degradation in ML Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why do production ML systems need fallback strategies, and what is the typical fallback hierarchy from most to least sophisticated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2683", "title": "Fallback Availability Impact on SLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 99.5%, 99.9%, and 99.99% availability layers, what is overall system availability assuming independent failures?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2684", "title": "Why Backward Pass is 2x Forward", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does the backward pass require about twice the FLOPs of the forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2685", "title": "Estimating Total Training FLOPs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using 6ND, what are the total training FLOPs for a 7B model on 1T tokens and the training time on 256 A100s at 50% MFU?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 1}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2686", "title": "Compute-Optimal Model/Data Allocation Under Chinchilla Scaling Laws", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option is closest to Chinchilla compute-optimal training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2687", "title": "Physical Limits on Training Cluster Scale", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why can't you keep halving training time by doubling GPUs, and what three physical ceilings limit cluster scaling?", "chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 0}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2688", "title": "Sparse Gradient AllReduce Efficiency for a 10B Model", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much does 90% sparse gradient AllReduce reduce communication time and training efficiency versus the dense 20GB baseline?", "chain_ids": ["cloud-chain-auto-005-14"], "chain_positions": {"cloud-chain-auto-005-14": 0}, "chain_tiers": {"cloud-chain-auto-005-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2689", "title": "Scaling Strategy Under Physical Constraints", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which scaling strategy maximizes time-to-train reduction under the physical facility constraints?", 
"chain_ids": ["cloud-chain-auto-005-08"], "chain_positions": {"cloud-chain-auto-005-08": 3}, "chain_tiers": {"cloud-chain-auto-005-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2690", "title": "Recursive Halving-Doubling vs Ring AllReduce", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does recursive halving-doubling achieve AllReduce in O(log N) steps versus ring AllReduce's O(N), and when is each preferred?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 1}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2692", "title": "Choosing AllReduce for 20GB Gradients vs 1KB Scalars", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use different algorithms for each?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 4}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2693", "title": "Why Standard Optimizers Fail with Compressed Gradients", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does naive gradient compression such as top-k sparsification or 1-bit quantization degrade convergence with Adam?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2694", "title": "Communication Savings from 1-Bit Adam", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 1-bit Adam on a 10B model across 64 GPUs, what communication reduction and AllReduce time at 200 Gb/s do you get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2695", "title": "Compression Strategy for Bandwidth-Constrained Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach best balances convergence and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2696", "title": "Why GPUDirect Storage Eliminates the CPU Bounce Buffer", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does GPUDirect Storage bypass CPU RAM, and why does that improve training data throughput?", "chain_ids": 
["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2697", "title": "Data Loading Bottleneck Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the epoch times with and without GPUDirect Storage, and what is the true hardware bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2698", "title": "Storage Architecture for Large-Scale Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which storage architecture provides the best throughput for large-scale training, and why do the others fail?", "chain_ids": ["cloud-chain-auto-secondary-001-03"], "chain_positions": {"cloud-chain-auto-secondary-001-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-001-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2699", "title": "SSP Staleness Bound Intuition", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What does Stale Synchronous Parallel (SSP) actually guarantee, and why does its staleness bound matter for convergence?", "chain_ids": ["cloud-chain-auto-013-08"], "chain_positions": {"cloud-chain-auto-013-08": 0}, "chain_tiers": {"cloud-chain-auto-013-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2700", "title": "SSP Throughput vs BSP", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With BSP at 180 ms due to persistent stragglers, what throughput improvement should SSP with S=5 provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2701", "title": "Synchronous FSDP Straggler Mitigation at 256 GPUs", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 256-GPU synchronous FSDP training of a 7B LLM, which mitigation strategy (pure sync, sync + backup workers, or bounded-staleness async) preserves convergence with the smallest throughput cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2702", "title": "Linear Scaling Rule Intuition", "topic": 
"data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does scaling from 8 to 64 GPUs with an 8x larger global batch require increasing the learning rate?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 0}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2703", "title": "LR Warmup Duration Calculation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the target learning rate and the number of warmup steps?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 1}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2704", "title": "When Linear Scaling Breaks Down", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For ResNet-50 at batch size 65,536 diverging after warmup, should you use sqrt LR scaling, LARS, or reduce batch size?", "chain_ids": ["cloud-chain-auto-013-10"], "chain_positions": {"cloud-chain-auto-013-10": 2}, "chain_tiers": {"cloud-chain-auto-013-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2705", "title": "Critical Batch Size Concept", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is critical batch size, and why does doubling GPUs not guarantee a 2x training speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2706", "title": "Compute Efficiency Beyond Critical Batch Size", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At B_crit=4096, what time savings and compute efficiency should you expect when scaling to B=16384 on 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2707", "title": "Budget Allocation Near Critical Batch Size", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With a $500K budget, no 2-week deadline, and both options reaching the same final loss, should you choose 128 GPUs at B=32K for 2 weeks or 32 GPUs at B=8K for 6 weeks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2708", "title": "Parameter Server vs AllReduce", "topic": 
"data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the key architectural difference between parameter servers and AllReduce, and why do modern frameworks favor AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2709", "title": "PS Bandwidth Bottleneck", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 32 workers, 4 parameter servers, 100 Gbps Ethernet, and 2GB gradients per worker, what is the minimum push time and bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2710", "title": "PS for Sparse Recommendation Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a recommendation model with a 500GB embedding table and under 1% access per batch, should you use AllReduce sharding or parameter servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2711", "title": "Data Parallelism Around an Expert-Parallel MoE Block", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "When DP wraps an already expert-parallel 64-GPU MoE block, what does the DP layer actually replicate, and what AllReduce cost does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2713", "title": "Expert Load Imbalance Diagnosis", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 64-expert top-2 MoE with 40% MFU and highly skewed expert loads, should you add load loss, raise capacity, or use hash routing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2714", "title": "Bathtub Curve Phases", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "When should you expect the highest failure rates, and what are the three phases of the bathtub curve?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2715", "title": "Burn-in Cost vs Failure Cost", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should you burn-in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-2716", "title": "Fleet Refresh Timing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 3-year-old 2,000-GPU cluster entering wear-out, should you replace the fleet now, replace failures reactively, or roll replacements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2717", "title": "MTBF vs Cluster MTBF", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "With a 100,000-hour per-GPU MTBF and 1,000 GPUs, why is cluster-level MTBF dramatically lower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2718", "title": "Optimal Checkpoint Interval from MTBF", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Using the Young/Daly formula, what is the optimal checkpoint frequency for 512 GPUs with a 50,000-hour per-GPU MTBF?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2719", "title": "MTBF-Aware Cluster Sizing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For training a 70B model on 1024 GPUs for 14 days or 512 GPUs for 28 days, which loses less time to failures and checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2720", "title": "Permanent vs Transient Faults", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What type of fault is a GPU matmul error that persists after process restart but disappears on another GPU, and why does it matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2721", "title": "Permanent Fault Impact on Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "After 72 hours of a 256-GPU run with 6-hour checkpoints, how long does recovery take from a permanent GPU fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2722", "title": "Silent Data Corruption Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After 18 hours of silent data corruption from a permanent GPU 
fault, should you roll back, continue, or add online gradient checksumming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2723", "title": "Why Intermittent Faults Are Hardest", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why are sporadic GPU crashes the hardest class of fault to diagnose and handle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2725", "title": "Intermittent Fault Policy Design", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 20 GPUs with intermittent faults in a 2,000-GPU cluster, should you replace all, auto-quarantine after three strikes, or add hot spares?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2726", "title": "What Causes a Checkpoint Storm", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is a checkpoint storm, and why does checkpointing on 1,024 GPUs take 30 minutes instead of 5?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 1}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2727", "title": "Checkpoint Storm I/O Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 512-GPU FSDP checkpoint with 280MB shards and 200 GB/s Lustre, what write time do you expect with and without staggered writes?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 2}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2728", "title": "Checkpoint Storm Mitigation Strategy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 50 jobs causing 15-minute filesystem brownouts during checkpoints, should you stagger schedules, use local NVMe plus async copy, or add storage?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 4}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2729", "title": "Five Levels of ML Networking", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is
the five-level networking model, and why does each level matter for ML cluster workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2730", "title": "Fat-Tree Bisection and AllReduce Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using bidirectional aggregate bisection and one-way effective per-node AllReduce bandwidth, what bandwidths does this fat-tree provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2731", "title": "Fat-Tree RDMA AllReduce ECMP Congestion Diagnosis", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "In a full-bisection 400G RDMA fat-tree with AllReduce at 40% of theoretical, where might the bottleneck be and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2732", "title": "Link Budget Fundamentals", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is a link budget, and why does it determine whether 400G can run over 5 meters of passive copper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2735", "title": "Failure Domain Concept", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is a failure domain, and why does a ToR switch taking down 16 GPU nodes matter for ML job placement?", "chain_ids": ["cloud-chain-auto-004-08"], "chain_positions": {"cloud-chain-auto-004-08": 0}, "chain_tiers": {"cloud-chain-auto-004-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2736", "title": "Failure Domain Probability", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the probability of at least one ToR failure during a 30-day 128-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2737", "title": "Failure Domain vs Locality Trade-off", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 64-GPU job, should you place all 8 nodes in one rack for 30% faster AllReduce or spread them across 4 racks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "",
"napkin_math": ""}}, {"id": "cloud-2738", "title": "ML Technical Debt Sources", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is ML technical debt, and what sources make it different from traditional software technical debt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2739", "title": "Technical Debt Maintenance Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the refactoring break-even point and annual savings, and should the team prioritize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2741", "title": "Capacity Planning Fundamentals", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is doubling the GPU budget not enough when model count grows from 50 to 100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2742", "title": "Training vs Serving GPU Split", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimum GPU allocation is required for training and serving at a 70% utilization target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2743", "title": "On-Prem vs Cloud Capacity Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which of the on-prem (300 GPUs), hybrid burst (100 on-prem + 200 cloud), or all-cloud options minimizes 3-year cost, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2744", "title": "Continuous Training Triggers", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What should trigger continuous training for the declining CTR model, and how would it reduce staleness?", "chain_ids": ["cloud-chain-auto-011-11"], "chain_positions": {"cloud-chain-auto-011-11": 0}, "chain_tiers": {"cloud-chain-auto-011-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2745", "title": "Continuous Training Cost Model", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the weekly GPU cost change, and does continuous training save 
money once monitoring, data pipeline, and validation overhead are included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2746", "title": "Continuous Training Safety", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate shadow testing, human approval, and automated canaries for this 3x/week auto-deploy pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2747", "title": "Validation Gate Purpose", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why should the new model pass automated validation gates beyond a 2% aggregate accuracy gain before deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2748", "title": "Validation Gate Latency Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the weekly validation time if the 5 gates run sequentially, and how would you parallelize them to cut wall time by at least half?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2750", "title": "Monitoring Hierarchy Layers", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would hierarchical monitoring help triage the 15% revenue drop when system metrics are green?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2752", "title": "Monitoring Architecture for Multi-Model Fleet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design monitoring for 500 models, and which of per-model dashboards, anomaly detection, or SLOs should be primary?", "chain_ids": ["cloud-chain-auto-001-13"], "chain_positions": {"cloud-chain-auto-001-13": 2}, "chain_tiers": {"cloud-chain-auto-001-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2753", "title": "Alert Fatigue in ML Systems", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is alert fatigue, and why did 200+ daily alerts make the 12-hour missed ML degradation likely?", "chain_ids": ["cloud-chain-auto-001-13"], "chain_positions": {"cloud-chain-auto-001-13": 0}, "chain_tiers": {"cloud-chain-auto-001-13": "primary"}, "validated":
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2754", "title": "Daily Alert Reduction From Noisy Rule Anomaly Detection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many of the 200 daily alerts are reduced by moving the 3 noisy rules that generate 80% of alerts to anomaly detection?", "chain_ids": ["cloud-chain-auto-001-13"], "chain_positions": {"cloud-chain-auto-001-13": 1}, "chain_tiers": {"cloud-chain-auto-001-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2756", "title": "NAS Search Space and Cost", "topic": "neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does NAS search-space design matter more than the search algorithm, and why is NAS so computationally expensive?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2757", "title": "NAS Compute Budget", "topic": "neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total GPU-hours and dollar cost of this weight-sharing NAS run at $3 per GPU-hour?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2758", "title": "NAS vs Manual Design", "topic": "neural-architecture-search", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With 2 weeks, 1000 GPU-hours, and a 200ms edge latency limit, which model-selection option would you choose and why?", "chain_ids": ["cloud-chain-auto-secondary-015-09"], "chain_positions": {"cloud-chain-auto-secondary-015-09": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2759", "title": "Recommender Feedback Loops and Filter Bubbles", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How did the recommender's training feedback loop form, and why does it produce filter bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2761", "title": "Lending Feedback Loop Fairness Mitigation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": 
"cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate exploration, causal debiasing, and demographic parity for mitigating the lending feedback loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2762", "title": "Guardrail Architecture Layers", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What guardrails would prevent medical advice, pricing leaks, and off-topic replies, and where do they sit in the pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2763", "title": "Guardrail Latency Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Do the input/output classifiers and regex filter fit within the 2-second chatbot latency budget, and how can you optimize them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2765", "title": "Why OOD Detection Matters", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is OOD detection needed for nighttime infrared images, and why is high softmax confidence insufficient?", "chain_ids": ["cloud-chain-auto-secondary-017-10"], "chain_positions": {"cloud-chain-auto-secondary-017-10": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2767", "title": "OOD Detection Strategy Selection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you use temperature scaling, deep ensembles, or feature-density estimation for safety-critical OOD detection, and why?", "chain_ids": ["cloud-chain-auto-secondary-017-10"], "chain_positions": {"cloud-chain-auto-secondary-017-10": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2770", "title": "Prompt Injection Defense Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare classifier filtering, sandboxed tools with human review, and instruction hierarchy for prompt injection defense?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2771", "title": "ML Attack Surface", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is the ML model's threat model larger than a traditional software threat model, and what ML-specific attacks should it cover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2772", "title": "Threat Model Risk Matrix", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What four ML-specific threats would you include for the fraud detector, and how would you score their likelihood, impact, and risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2774", "title": "How Adversarial Evasion Works", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do adversarial stickers make a vision model misclassify a stop sign, and why can small perturbations flip predictions?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2775", "title": "Adversarial Training Compute Overhead: Adversarial Robustness & Security", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much longer and more expensive will 10-step PGD adversarial training be than the 24-hour standard run?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2776", "title": "Adversarial Defense Selection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which defenses should you use against structured-feature evasion attacks in production fraud detection, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-21"], "chain_positions": {"cloud-chain-auto-secondary-015-21": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2777", "title": "System Prompt Extraction Risk", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why is system prompt leakage a security risk, and why is telling the LLM not to reveal its prompt insufficient?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": 
{"cloud-chain-auto-004-13": 0}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2778", "title": "Injection Detection Accuracy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "At 100,000 queries/day with a 0.1% attack rate, what are the classifier's daily true positives, false positives, and precision?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": {"cloud-chain-auto-004-13": 1}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2779", "title": "LLM Injection Defense Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you evaluate fine-tuning, dual-instance isolation, and classifiers for prompt injection in a RAG application?", "chain_ids": ["cloud-chain-auto-004-13"], "chain_positions": {"cloud-chain-auto-004-13": 2}, "chain_tiers": {"cloud-chain-auto-004-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2780", "title": "WUE Metric Explained", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is Water Usage Effectiveness (WUE), and why is cooling AI training clusters a growing water sustainability concern?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 0}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2782", "title": "Cooling Strategy for Water-Scarce Region", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 10 MW IT-load datacenter in a water-scarce desert, which cooling option provides the best TCO and sustainability balance, and why?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 3}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2783", "title": "Embodied Carbon Concept", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is embodied carbon, and why does extending AI hardware lifespan effectively reduce annualized emissions?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 0}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2784", "title": "Fleet Lifecycle Carbon Calculation", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 1,000 A100 fleet, what is the annualized total carbon under 3-year versus 5-year replacement cycles?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 1}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2785", "title": "Upgrade vs Extend Hardware Lifecycle", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 1,000-GPU fleet, which replacement option minimizes the 5-year total carbon footprint, and why?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 3}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2786", "title": "The Divergence Problem", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why won't Moore's Law keep AI energy consumption in check as model compute demand grows?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": {"cloud-chain-auto-secondary-009-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2787", "title": "Energy Gap Projection", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What power will an equivalent 2030 frontier training run require, and how does it compare to a 100 MW datacenter?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": {"cloud-chain-auto-secondary-009-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2788", "title": "Strategies Against Divergent Scaling", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate on-site renewables, algorithmic efficiency, and power-abundant regions for this power-scaling problem?", "chain_ids": ["cloud-chain-auto-secondary-009-17"], "chain_positions": {"cloud-chain-auto-secondary-009-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2789", "title": "500B Model 3D Parallelism on 1024 80GB GPUs", "topic": "3d-parallelism", 
"competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 3D parallelism strategy would you use to train the 500B dense model to maximize hardware utilization and minimize communication bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2790", "title": "Asynchronous Checkpointing for 175B Training on 256 GPUs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design asynchronous checkpointing for a 175B model on 256 H100s without stalling training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2791", "title": "Cloud New 0004", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a tiered KV-cache for a 70B LLM with 100k context on an 8x H100 node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2792", "title": "Hybrid AOT JIT Compilation for Dynamic 13B Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a hybrid AOT/JIT compiler for a 13B model with sequence lengths from 10 to 8000?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 4}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2793", "title": "Cloud New 0006", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you map expert parallelism for a 300B MoE model across 128 nodes to avoid all-to-all congestion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2794", "title": "Disaggregated A100 Prefill and T4 Decode for 30B LLMs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design disaggregated serving for a 30B LLM using A100s for prefill and T4s for decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2796", "title": "Cloud New 0009", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a decoupled H100 serving pipeline for a vision encoder feeding an autoregressive LLM?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2798", "title": "Cloud New 0013", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compile and optimize a 7B model for real-time inference on Orin devices with 275 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2799", "title": "Privacy-Preserving Cross-Tenant Data-Parallel Aggregation", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect privacy-preserving 3-region data-parallel aggregation for a 70B model on 192 H100s with secure cross-region averaging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2800", "title": "Serverless 7B Inference on A100 with PCIe Gen4 Weight Offload", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design serverless 7B LLM inference on idle A100s with low cold-starts using PCIe Gen4 weight offload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2801", "title": "Silent Data Corruption Detection for 10,000-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect silent data corruption and hardware degradation across 10,000 GPUs without interrupting training?", "chain_ids": ["cloud-chain-auto-004-04"], "chain_positions": {"cloud-chain-auto-004-04": 3}, "chain_tiers": {"cloud-chain-auto-004-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2802", "title": "Sequence Parallelism for 100k Contexts on 1T Models", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you combine sequence parallelism with Megatron-style training to handle 100k contexts for a 1T model without hitting HBM3 limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2803", "title": "Fault-Tolerant Ring All-Reduce over Flaky 400Gbps InfiniBand", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you make Ring All-Reduce gradient synchronization resilient to flaky 400Gbps IB links for a 50B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2804", "title": "Optimizing MoE Routing Kernels for Sparse Accesses on H100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you auto-tune custom MoE routing kernels on H100s to maximize HBM3 bandwidth for sparse accesses?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 3}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2805", "title": "Heterogeneous H100 and Spot V100 Training Scheduler", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 100B training across reserved H100s and volatile spot V100s to reduce cost while guaranteeing progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2806", "title": "Low-Overhead Tracing for Checkpoint Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you trace checkpoint and recovery stalls in a 70B A100 training job without inflating P99 latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2807", "title": "Heterogeneous-Bandwidth Data-Parallel AllReduce Architecture", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect data-parallel AllReduce on 128 A100s split between an InfiniBand HDR island and an Ethernet half so synchronization fits inside a 380 ms backward pass?", "chain_ids": ["cloud-chain-auto-013-07"], "chain_positions": {"cloud-chain-auto-013-07": 5}, "chain_tiers": {"cloud-chain-auto-013-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2808", "title": "Pipeline Parallel 7B Serving over PCIe Gen3", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a pipeline parallel serving architecture to maximize throughput and minimize latency for a 7B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2809", "title": "Cloud New 0027", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a shared KV-cache for 1000s of AI NPC users who share a world-state prompt but branch into unique dialogues?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2810", "title": "Preemptible LLM KV-Cache Checkpointing and Migration", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you migrate KV-cache state for preemptible H100 LLM serving without losing sessions or causing large latency spikes?", "chain_ids": ["cloud-chain-auto-004-07"], "chain_positions": {"cloud-chain-auto-004-07": 3}, "chain_tiers": {"cloud-chain-auto-004-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2811", "title": "Pipeline Parallelism across Asymmetric Clusters", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you fine-tune a 65B model across 4 V100 32GB nodes and 2 A100 80GB nodes without stragglers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2812", "title": "Auto-Scaling and Routing for 7B Chat Model on Mixed T4 Fleet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you autoscale and route traffic for a 7B chat model with 10x daily swings using reserved and spot T4s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2819", "title": "NVLink vs PCIe Gen4 Bandwidth on A100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the primary use cases and bandwidths of NVLink versus PCIe Gen4 on an A100 system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2821", "title": "Cloud New 0011", "topic": "data-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is Data Parallelism, and what happens to the model weights across GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2837", "title": "Cloud New 0031", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is Operator Fusion, and why does it improve performance for memory-bound workloads?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2838", "title": "Cloud New 0032", "topic": 
"graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is JIT compilation, and how does it differ from AOT compilation in PyTorch-style ML workloads?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 0}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2839", "title": "Cloud New 0033", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does XLA stand for, and what is its primary purpose in an ML stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2840", "title": "Cloud New 0034", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is Constant Folding, and how does it simplify a computational graph before inference?", "chain_ids": ["cloud-chain-auto-005-04"], "chain_positions": {"cloud-chain-auto-005-04": 0}, "chain_tiers": {"cloud-chain-auto-005-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2841", "title": "Cloud New 0035", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is auto-tuning in ML compilation, and how does it improve kernel performance?", "chain_ids": ["cloud-chain-auto-005-11"], "chain_positions": {"cloud-chain-auto-005-11": 0}, "chain_tiers": {"cloud-chain-auto-005-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2842", "title": "Asynchronous Checkpointing vs Synchronous Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is asynchronous checkpointing, and how does it mitigate compute stalls compared to synchronous checkpointing?", "chain_ids": ["cloud-chain-auto-004-02"], "chain_positions": {"cloud-chain-auto-004-02": 0}, "chain_tiers": {"cloud-chain-auto-004-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2843", "title": "Cloud New 0038", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a heartbeat mechanism, and how does a cluster orchestrator detect a dead node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2844", "title": "Cloud New 0039", "topic": 
"fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Define Elastic Training, and state its primary benefit when a node fails in the middle of a distributed job?", "chain_ids": ["cloud-chain-auto-004-05"], "chain_positions": {"cloud-chain-auto-004-05": 0}, "chain_tiers": {"cloud-chain-auto-004-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2845", "title": "Pipeline Bubble in Pipeline Parallelism", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the pipeline bubble in pipeline-parallel training, and why does it occur during the initial forward pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2847", "title": "Operating Point on the Queueing Hockey-Stick", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using the M/M/1 queueing model, what utilization brings P99 latency under the 50 ms SLO, and why does latency spike as utilization approaches 1?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 3}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2849", "title": "Incast Network Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the 28ms latency based on the network bandwidth constraints of the aggregator?", "visual": {"kind": "svg", "path": "cloud-2849.svg", "alt": "A diagram showing multiple worker nodes simultaneously sending 200 MB of data to a single aggregator node, creating an incast bottleneck at the aggregator's 400 Gbps NIC.", "caption": "Simultaneous fan-in (incast) traffic overwhelming a single receiver link."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2850", "title": "MoE Topology Oversubscription", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the inter-node All-to-All latency for both the 1:1 and 2:1 topologies, and specify whether the 2:1 design meets a strict 50 ms communication SLO?", "visual": {"kind": "svg", "path": "cloud-2850.svg", "alt": "A two-tier leaf-spine network topology diagram showing 2 spine switches connected to 4 leaf switches, with each leaf switch connected to 2 compute nodes (8 GPUs each), illustrating a 2:1 oversubscription ratio.", "caption": "2:1 Oversubscribed Leaf-Spine Interconnect Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2851", "title": "Raw Tensor Network Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the data loading throughput to diagnose the root cause of the low utilization?", "visual": {"kind": "svg", "path": "cloud-2851.svg", "alt": "Bar chart comparing 6.25 GB/s network capacity against 32.2 GB/s required for raw FP16 images, and ~1 GB/s for JPEG.", "caption": "Throughput limits and demands for the data loading pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2852", "title": "Checkpoint Efficiency and SLA Targets", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether the current storage backend can meet the 90% efficiency SLA using optimal checkpointing, and if not, specify the minimum write bandwidth required?", "visual": {"kind": "svg", "path": "cloud-2852.svg", "alt": "Timeline diagram illustrating normal operation with checkpoints, a crash failure, the lost work interval (RPO), and the recovery time (RTO).", "caption": "Checkpointing timeline showing the tradeoff between interval frequency (T) and the lost work upon failure."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2854", "title": "HBM Cache Hit Rate", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum cache hit rate required in HBM to achieve an effective average embedding lookup bandwidth of at least 1.6 TB/s?", "visual": {"kind": "svg", "path": "cloud-2854.svg", "alt": "A bar chart comparing CPU DRAM bandwidth at 64 GB/s, effective target at 1600 GB/s, and HBM3 at 3200 GB/s on a logarithmic scale.", "caption": "Memory hierarchy bandwidth tiering comparison."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2857", "title": "M/M/1 Accelerator Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At what arrival rate will the average response time spike to exactly 40 ms?", "visual": {"kind": "svg", "path": "cloud-2857.svg", "alt": "A hockey-stick line graph of an M/M/1 queue showing average latency exponentially rising as arrival rate approaches 150 requests per second.", "caption": "Queueing theory hockey-stick curve for M/M/1 latency."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2859", "title": "Multimodal Pipeline Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the required S3 network bandwidth in GB/s and determine the 
minimum number of CPU cores required if each core decodes 200 images/second?", "visual": {"kind": "svg", "path": "cloud-2859.svg", "alt": "Bar chart comparing the throughput capacities of S3 read, CPU decode, PCIe Gen5 transfer, and GPU processing stages against a target horizontal line.", "caption": "Data Pipeline Throughput vs Bottleneck Target"}, "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 4}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2860", "title": "Cloud GPU Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycling schedule with a 1-hour buffer around the peak, and compute the daily kWh saved compared to keeping all 100 GPUs fully active 24/7?", "visual": {"kind": "svg", "path": "cloud-2860.svg", "alt": "A step plot showing the number of active GPUs increasing before a traffic peak and dropping immediately after, overlaid on a smooth traffic demand curve.", "caption": "Proactive Duty Cycling Over 24 Hours"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2862", "title": "Why Synchronous Checkpointing Breaks at 11.25-Minute System MTBF", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Determine the system's Mean Time Between Failures (MTBF) and explain why standard synchronous checkpointing is unviable here?", "visual": {"kind": "svg", "path": "cloud-2862.svg", "alt": "A horizontal bar chart timeline illustrating the extreme overlap of checkpointing, compute, and recovery times when system MTBF is nearly equal to checkpoint duration.", "caption": "Synchronous Checkpointing Overhead Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2870", "title": "Gradient Sync Bucket Spec", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What measurements are needed before tuning gradient bucket size for communication-computation overlap?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 2}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2878", "title": "Estimating Activation Memory for a Transformer Layer on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much activation memory will the forward pass consume across all 72 layers, and will it fit alongside fp16 70B weights?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 2}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2879", "title": "Designing a Gradient Checkpointing Segment Strategy for a 70B Model on A100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Given a 40 GB/GPU uncheckpointed activation footprint, what gradient-checkpointing strategy keeps activations under 28 GB/GPU while minimizing recomputation for this fine-tune?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 1}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2880", "title": "Designing Activation Offloading vs. Checkpointing Trade-off for Long-Sequence Training on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Should you use checkpointing every 4 layers or PCIe 5.0 activation offload to keep the 32K-token step under 2 seconds, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2881", "title": "Evaluating Gradient Checkpointing Overhead on Training Throughput for GPT-Scale Models", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is the 21.6% throughput loss from checkpointing acceptable for the OOMing 13B job, and does it match theoretical recompute overhead?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 2}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2882", "title": "Evaluating Activation Memory Profiler Output for Silent Memory Fragmentation", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate whether activation memory fragmentation is the cause and determine a remediation path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2883", "title": "Custom MoE Checkpointing by Recomputing Router Logits", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you implement fine-grained MoE checkpointing so gating logits are discarded and only routing is recomputed in backward?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2884", "title": "Deriving Optimal Checkpoint Intervals", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": 
"published", "phase": "training", "question": "What is the largest feasible checkpoint interval k under an 80GB HBM budget, assuming peak activation memory within a checkpoint segment is k layers times 3GB per layer?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 3}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2885", "title": "Optimizing FlashAttention vs. Standard Attention Activation Footprint at Scale", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much activation memory does FlashAttention save at S=8192, and does it eliminate the need for gradient checkpointing within an 80 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2886", "title": "Optimizing Activation Memory for Pipeline-Parallel Training Across A100 Nodes", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the first 1F1B pipeline stage use 62 GB of activations while the last stage uses so much less memory, and how would you reduce the imbalance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2887", "title": "Realizing Activation Checkpointing in a ZeRO-3 + Tensor-Parallel Hybrid Configuration", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What happens to the 22 GB/GPU activation memory when DeepSpeed activation checkpointing is enabled, and how does it interact with TP=4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2888", "title": "When Can KV Cache Exceed Training Activations?", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is inference always lower-memory than training at the same batch and sequence length, and when can KV cache exceed training activations?", "chain_ids": ["cloud-chain-auto-027-26"], "chain_positions": {"cloud-chain-auto-027-26": 4}, "chain_tiers": {"cloud-chain-auto-027-26": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2889", "title": "Specifying Activation Memory Budget for a Multi-Tenant Training Cluster on H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What activation memory allocation policy should the scheduler enforce for jobs across model size, sequence length, batch, and checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-2890", "title": "Specifying Gradient Checkpointing Requirements in a Model Training SLA", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpointing and sharding configuration satisfies the 90% HBM, 350 tokens/sec/GPU, and 500B-token-in-60-days constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2891", "title": "Analyze depthwise separable convolution parameter reduction on ResNet-50 baseline", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FLOP speedup would replacing 3x3 convs with depthwise-separable layers give, and is that alone enough to justify retraining?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2892", "title": "Analyze EfficientNet compound scaling vs. naive width/depth scaling on throughput", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which of EfficientNet-B4, widened ResNet-50, or deepened ResNet-101 has the best accuracy-per-FLOP and throughput?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 1}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2893", "title": "Analyze inverted residual bottleneck memory access patterns on A100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the MobileNetV2 expand layer the bottleneck for 112x112x32 inputs on A100, or are depthwise/project layers more bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2894", "title": "Design an efficient CNN serving pipeline for H100 multi-tenant inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a batching and memory management strategy that maximizes single-GPU utilization while keeping p99 latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2895", "title": "CNN Pareto Frontier and A100 Serving Cost", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the 
model-selection experiment to find the FLOP-accuracy Pareto frontier and serving cost for 200K QPS on 32 A100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2896", "title": "Design EfficientNetV2 training pipeline for large-scale cloud fine-tuning", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What distributed training, gradient accumulation, and memory plan would let EfficientNetV2-XL fine-tune on 50M images in under 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2897", "title": "Diagnose low GPU utilization when serving MobileNetV3 on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is MobileNetV3-Large only reaching 12% SM utilization at 5,000 QPS batch=1 on H100, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2898", "title": "Diagnose accuracy degradation after replacing standard convs with depthwise separable", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing ResNet-34 3x3 convolutions with depthwise separable convolutions hurt ImageNet accuracy, and how can it be recovered?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2899", "title": "Diagnose NaN losses during EfficientNet training with aggressive augmentation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What failure chain could make EfficientNet-B5 with BF16 and Mixup diverge at SE blocks, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2900", "title": "Evaluate MobileNetV3 vs EfficientNet-B0 accuracy-latency tradeoff for A100 batch serving", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model gives better cost per unit accuracy under a 30 ms p99 A100 TensorRT INT8 SLA: MobileNetV3-Large or EfficientNet-B0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2901", "title": "Evaluate EfficientNet compound scaling coefficient impact on H100 memory bandwidth", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "As EfficientNet scales from B1 to B4 on H100, where do memory pressure and accuracy gains begin to 
show diminishing returns?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2902", "title": "Recall depthwise separable convolution FLOP formula", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How do you compute the FLOPs for a standard 3x3 convolution versus a depthwise separable convolution with Cin=Cout=128 at 28x28, and what is the reduction ratio?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2903", "title": "Recall MobileNetV2 inverted residual structure and why it inverts the bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the architectural difference between a ResNet bottleneck and a MobileNetV2 inverted residual block, and why use a linear bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2904", "title": "Recall EfficientNet compound scaling constraints and baseline architecture", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does EfficientNet's compound scaling differ from simply widening or deepening a baseline model like B0, and what do the coefficients represent?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2905", "title": "Implement INT8 quantization for MobileNetV3 on TensorRT with calibration", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calibrate and convert MobileNetV3-Large to TensorRT INT8 on A100 and verify the accuracy drop stays below 1%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2906", "title": "Optimizing Depthwise Convolution with Shared Memory on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you optimize the C=512, 14x14, batch-32 depthwise 3x3 layer on H100 beyond cuDNN's 18% SM utilization?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": 
{"cloud-chain-auto-secondary-002-05": 3}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2907", "title": "Implement channel pruning for EfficientNet-B3 to reduce serving costs", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you prune EfficientNet-B3 by 40% serving cost while retaining at least 81% top-1, and how would you validate it in TensorRT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2908", "title": "Mastery: explain why inverted residuals outperform standard bottlenecks on memory-bandwidth-limited hardware", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would a roofline model argument using arithmetic intensity explain why inverted residuals outperform standard bottlenecks on bandwidth-limited hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2909", "title": "Mastery: trade-off analysis of SE block overhead vs accuracy gain in EfficientNet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the total SE-module FLOP overhead in EfficientNet-B0, and would removing SE plus width scaling lower H100 cost at the same accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2910", "title": "Mastery: knowledge distillation from EfficientNet-B7 to MobileNetV3 at scale", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you distill EfficientNet-B7 into MobileNetV3-Large, and why is intermediate feature distillation needed beyond soft labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2911", "title": "Optimize depthwise separable convolution throughput via kernel fusion on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you fuse kernels in the MobileNetV2 expand-depthwise-project block to reduce 47 launches and memory transactions on H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2912", "title": "Optimize EfficientNet training throughput with gradient checkpointing and mixed precision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What 
speedup and throughput do batch=256, BF16, selective checkpointing, and CUDA graphs provide, and does it meet the 1,200 images/sec/GPU target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2913", "title": "Choose a MobileNetV3-Large width multiplier for a 2ms batch-128 latency budget", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What MobileNetV3-Large width multiplier best improves accuracy while keeping batch-128 latency under the 2ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2914", "title": "Realize end-to-end EfficientNet-B4 serving with DALI preprocessing on H100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you build the GPU preprocessing and inference pipeline for EfficientNet-B4 to serve 2,000 QPS of 1080p JPEGs on one H100 node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2915", "title": "Realize MobileNetV2 export pipeline from PyTorch to ONNX to TensorRT", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the complete export chain from PyTorch to TensorRT FP16, and what are the known failure points at each step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2916", "title": "Realize automated NAS-style width and depth search for EfficientNet-style backbone", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you realize a once-for-all NAS pipeline under the budget, and what is the maximum per-epoch time constraint if 22,500 seconds is allocated to training?", "chain_ids": ["cloud-chain-auto-secondary-002-06"], "chain_positions": {"cloud-chain-auto-secondary-002-06": 3}, "chain_tiers": {"cloud-chain-auto-secondary-002-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2917", "title": "Specification: define latency-accuracy SLA for efficient CNN cloud API", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What hardware, model, batching, and failure-mode spec would you choose to serve 50K QPS under the p50, p99, and >80% ImageNet SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2918", "title": "Fluency: explain depthwise separable convolution to a non-ML systems engineer", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": 
"cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do depthwise separable convolutions differ from standard convolutions, why are they faster, and how can they change model behavior?", "chain_ids": ["cloud-chain-auto-secondary-002-05"], "chain_positions": {"cloud-chain-auto-secondary-002-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-002-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2919", "title": "RAG Pipeline Latency Budget on H100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the 380 ms RAG TTFT being spent, which stage is the bottleneck, and what changes get P95 under 300 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2920", "title": "Router Model Accuracy vs Latency Tradeoff in Multi-Model Pipeline", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is 91% router accuracy sufficient for the GPT-4-class versus 7B routing setup, given expected latency, P95, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2921", "title": "Explaining Compound AI Pipeline Behavior to Non-Technical Stakeholders", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why can the 5-stage agentic RAG pipeline give factual errors even when each stage's metrics look healthy in isolation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2922", "title": "Mastering KV Cache Sharing Across Agents in Multi-Model Orchestration", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If the optimized pipeline costs 42% of the original baseline, what is the actual cost reduction, and why might someone incorrectly claim 37% savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2923", "title": "End-to-End Latency SLA for Multi-Hop Agent Chains on A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which hops in the 4-hop agent chain can be parallelized, and what P95 latency is achievable after restructuring toward the 500 ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2924", "title": "Memory Capacity Planning for Concurrent RAG Agents on H100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": 
"L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Do the 16 concurrent 13B agent sessions plus shared 7B reranker fit on one 8xH100 node, and how would you allocate memory with tensor parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2925", "title": "Cold Start Latency for Serverless Compound AI on A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What keep-warm strategy would reduce the 18s cold start for the bursty serverless deployment without paying for always-on GPU time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2926", "title": "Implementing Semantic Caching for RAG on H100 to Reduce LLM Calls", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a semantic cache for the 50,000-query/day RAG system, and how should it invalidate entries after 6-hour KB updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2927", "title": "K8s GPU Device Plugin: Why Pod Requests Whole GPUs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does nvidia.com/gpu: 1 allocate an entire GPU, how does the device plugin expose GPUs, and why aren't fractions default?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 0}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2928", "title": "Specifying a Carbon-Aware Cooling Control Policy for H100 Clusters", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you specify a cooling control policy that dynamically switches between cooling modes based on both ambient temperature and grid carbon intensity to minimize operational carbon footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2929", "title": "Designing a Volcano Job for Multi-Pod Distributed Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What Kubernetes scheduling configuration prevents partial-allocation deadlock for 64-GPU PyTorch DDP jobs sharing the H100 cluster with inference workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2930", "title": "Set Kubernetes memory limits and readiness probes to 
prevent vLLM OOMKilled restarts", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What Kubernetes resource and readiness configuration would reduce vLLM OOMKilled restarts on A100 nodes without over-provisioning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2931", "title": "Implement a Kubernetes scheduler plugin for H100 GPU topology", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you implement a Kubernetes scheduler extension that scores H100 nodes by GPU-to-GPU bandwidth topology for the 4-node training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2932", "title": "Evaluating Gang Scheduling vs. Elastic Training Tradeoffs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For week-long fine-tunes on the 128-H100 cluster with a 5% weekly node failure rate, should you use Volcano gang scheduling or elastic training, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2933", "title": "Evaluating Horizontal Pod Autoscaler for GPU Inference Scaling", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is HPA on GPU utilization the right autoscaling signal for bursty 2–3 minute vLLM traffic spikes, and what should you use instead?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 2}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2934", "title": "Fluency: Translating a PyTorch DDP Job into a Kubernetes Manifest", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you translate the 4-node, 8-GPU-per-node torchrun command into a Training Operator PyTorchJob manifest?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2935", "title": "Realizing Cluster Autoscaler Behavior with GPU Node Groups", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do GPU nodes in the EKS cluster take 8–12 minutes to join after a pod goes pending, and what steps make up that delay?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 0}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2936", "title": "Realizing Why GPU Jobs Hang After Node Preemption", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is happening when the job hangs after preemption, and how would you detect this failure proactively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2937", "title": "Analyzing Pod Scheduling Latency Under Quota Pressure", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Team A’s GPU pods take 15–20 minutes to schedule while Team B’s schedule quickly despite both being within quota?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2938", "title": "Analyzing DCGM Exporter Metric Blast Radius in Multi-Tenant GPU Clusters", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the blast radius of the cluster-wide DCGM Exporter, and how would you design RBAC and GPU metric isolation between namespaces?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2939", "title": "Mastery: Designing a Zero-Downtime GPU Cluster Upgrade Strategy", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you roll out the NVIDIA 525-to-550 driver upgrade across 64 H100 nodes without >5 minutes inference downtime or lost training progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2940", "title": "Recall: What PUE Measures and Why 1.0 Is Physically Impossible", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why is a PUE of 1.3 targeted instead of 1.0 for the datacenter, and is a lower PUE always achievable or better?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 0}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2941", "title": "Recall: Rack Power Budget Limits for H100 Dense Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Can a standard 15 kW rack feasibly host two 10.2 kW DGX H100 nodes, and what datacenter constraints apply?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2942", "title": "Realizing How Liquid Cooling Changes PUE for H100 Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Where would direct liquid cooling save power versus the current PUE 1.55 air-cooled H100 cluster, and what infrastructure changes are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2943", "title": "Realizing Carbon-Aware Scheduling: When to Shift GPU Workloads", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should carbon-aware scheduling route the 7-day, 512-GPU H100 training job between US-West and US-East, and what grid-signal infrastructure is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2944", "title": "Realizing Power Capping vs. Thermal Throttling on H100", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might a 500 W power cap cause a 30% throughput drop instead of the expected 15%, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2945", "title": "Implementing Power-Aware Bin Packing for Heterogeneous GPU Racks", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a power-aware placement algorithm that maximizes H100 and A100 GPU density per rack without exceeding 20 kW at P95 load?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 3}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2946", "title": "Implementing Carbon-Aware Job Queue with WattTime Integration", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you implement a WattTime-driven Kubernetes scheduler that cuts Scope 2 emissions 30% without increasing average job latency over 20%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2947", "title": "Evaluating PUE Measurement Methodologies: Instantaneous vs. 
Annual", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which PUE quote is more meaningful for H100 hosting, Vendor A’s instantaneous 1.2 or Vendor B’s ISO annual 1.45, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2948", "title": "Evaluating Free Cooling Feasibility for H100 Liquid-Cooled Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is year-round free cooling viable for a DLC-equipped H100 cluster in Austin with <= 45°C inlet water, and how many hours need mechanical assist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2949", "title": "Evaluating Carbon Cost of US-East GPUs vs TPU v5e for LLM Training", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the total carbon footprint of pre-training the 7B LLM on US-East GPUs versus Google TPU v5e, and which option is lower-carbon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2950", "title": "Diagnosing Unexpected PUE Spike in H100 Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically diagnose a weekend PUE spike from 1.35 to 1.72 when H100 GPU utilization stayed at 85% and cooling had no alerts?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 2}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2951", "title": "Optimization: Reducing Stranded Power Through Dynamic GPU Power Capping", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you dynamically power-cap the 128-GPU A100 cluster so all GPUs stay powered while total IT power remains under 45 kW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2952", "title": "Specification: Designing Rack Power Distribution for MI300X Dense Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What rack layout, PDU configuration, circuits, and breaker sizing would you specify for the 64-node MI300X cluster with N+1 redundancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2953", "title": "Dataset 
Curation: Why Data Quality Degrades Model Calibration on H100", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can near-duplicate web-scraped text cause ECE > 0.15 in the 70B LLM, and what data statistics would reveal the problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2954", "title": "Dataset Curation: Design a Multi-Stage Quality Pipeline for LLM Pre-Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a multi-stage pipeline to filter 50 TB of raw Common Crawl into about 5 TB of high-quality tokens within 48 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2955", "title": "Dataset Curation: Design a Labeling Pipeline for Multi-Modal Training Data", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you label 100M image-text pairs to exceed 95% accuracy in 30 days with 10 human annotators and access to GPT-4V?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2956", "title": "Dataset Curation: Evaluate Deduplication Strategies for Pre-Training Data Quality", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 20 TB pre-training dataset use exact URL dedup or 0.8-Jaccard MinHash near-dedup, given quality and runtime tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2957", "title": "Dataset Curation: Evaluate Domain Mixing Ratios for Instruction Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which mixing strategy should you use for the 500K code, 200K instruction, and 50K math examples, and how would you evaluate forgetting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2958", "title": "Dataset Curation: Napkin Math for Tokenizer Coverage on Multilingual Data", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What tokenizer fertility should you expect after adding 20% non-English data to an English-only 32K tokenizer, and how does it affect sequence length and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2959", "title": "Dataset Curation: Napkin Math for Data Mixing Budget", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many of the 1.3T Chinchilla-optimal tokens should come from web, books, code, and papers after applying proportional mixing with a 2x quality multiplier for books?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2960", "title": "Size a KenLM Perplexity Scorer for 1B Documents in 6 Hours", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design and size a KenLM perplexity scorer to process 1B documents within 6 hours on a 16-node CPU cluster?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2961", "title": "Dataset Curation: Mastery — End-to-End Pre-Training Data Strategy for 100B Model", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete data curation strategy (sourcing, filtering, deduplication, mixing) is required, and what throughput must the pipeline sustain?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2963", "title": "Dataset Curation: Mastery — Data Flywheel Architecture for Production LLM", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a weekly data flywheel that turns 50M daily user queries into private, high-quality, fresh training data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2964", "title": "Fix an NFS DataLoader Bottleneck in a 256-H100 Training Job", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is starving the 256-H100 training job, and how would you fix the DataLoader and storage path to recover the expected MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2965", "title": "Dataset Curation Optimize Deduplication Pipeline Scalability", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the bottleneck in the MinHash dedup pipeline, and 
how would you complete the 5T-token run in under 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2966", "title": "Dataset Curation: Realize Token Budget for Chinchilla-Optimal 30B Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the full data requirements and DataLoader throughput needed for a Chinchilla-optimal 30B model on 64 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2967", "title": "Dataset Curation: Realize Multi-Modal Dataset Storage and Access Architecture", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage, indexing, and DataLoader architecture would sustain training on 1B image-text pairs for the 20B vision-language model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2968", "title": "Dataset Curation: Recall — What is Perplexity-Based Data Filtering?", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is perplexity-based filtering, and what threshold range is commonly used to classify low-quality versus high-quality documents?", "chain_ids": ["cloud-chain-auto-secondary-005-06"], "chain_positions": {"cloud-chain-auto-secondary-005-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2969", "title": "Dataset Curation: Specification — Design a Data Quality SLA for Continuous Pre-Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative SLAs should gate the 100M weekly documents for data quality, freshness, deduplication, PII, and pipeline throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2970", "title": "Fault Tolerance: Analyze Checkpoint Frequency vs Recovery Cost on H100 Cluster", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes expected lost time for the 70B training job with 8-minute checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2971", "title": "Fault Tolerance: Design Checkpointing Architecture for 1024-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", 
"question": "What checkpointing strategy minimizes interruption for a 175B model on 1,024 GPUs while preserving full recovery capability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2972", "title": "Fault Tolerance: Evaluate Full vs Incremental Checkpointing for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 2-week 30B run, how do full 30-minute checkpoints compare with 10-minute incremental checkpoints, and what strategy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2973", "title": "Fault Tolerance: Evaluate ZeRO vs Full Replica Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For ZeRO-3 on 256 H100s, should checkpointing save each rank's shard or gather the full model on rank 0, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2974", "title": "Fault Tolerance: Implement Checkpoint Size Calculation for Mixed-Precision Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the per-rank and total checkpoint sizes for the 13B ZeRO-2 job, and do they fit within 100GB of NVMe per node?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2975", "title": "Design Fault Tolerance for a 500B Model on 2,048 GPUs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What fault-tolerance architecture for checkpointing, detection, recovery, and spares will keep the 500B, 2,048-GPU run above 95% uptime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2976", "title": "Fault Tolerance: Optimize Checkpoint Write Throughput on Lustre Filesystem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Diagnose the Lustre bottleneck and explain how to tune it to reach at least 120 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2977", "title": "Fault Tolerance: Specification — Define Recovery Time Objective for LLM Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What RTO, RPO, checkpoint cadence, storage budget, and recovery SLAs would you 
set for the 90-day 512-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2978", "title": "Feature Store: Design a Low-Latency Feature Store for Real-Time LLM Serving", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a feature store architecture to serve 100K requests/second with 500 features per request at < 5ms p99 latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2979", "title": "Feature Store: Diagnose Feature Staleness Causing Model Accuracy Degradation", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root cause explains the CTR drop when a feature has zero variance, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2980", "title": "Feature Store: Evaluate Online vs Offline Feature Store Architectures", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the daily revenue impact of a 2% metric degradation due to training-serving skew if the system processes 100K RPS with an average value of $0.01 per prediction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2981", "title": "Feature Store: Evaluate Point-in-Time Correct Feature Joins for Training Data", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is a latest-value feature join risky for the fraud dataset, and how does a point-in-time correct join avoid data leakage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2982", "title": "Redis Feature Store Memory and Bandwidth Sizing", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much Redis memory and network bandwidth are needed for 10M users, 5M catalog items, and 50K requests per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2983", "title": "Low-Latency Multi-Modal Feature Store for LLM Serving", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What feature store architecture meets < 3ms p99 for 200K requests/s with text embeddings, session context, and image embeddings?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 4}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2984", "title": "Nightly Feature Pipeline Incremental Aggregation Optimization", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you optimize the 32-GPU nightly feature pipeline from 6 hours to under 2 hours when 70% of time is in 5 aggregations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2985", "title": "Feature Store: Realize Feature Store Sizing for Production ML Platform", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What complete feature store infrastructure would you build for 10 model families, 500M users, 1M catalog items, and 500K requests/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2986", "title": "Feature Store: Specification — Define Feature Freshness SLA for Real-Time Fraud Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What feature freshness SLAs, latency budgets, and monitoring would you set for the 50K TPS fraud system's three feature tiers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2987", "title": "Kernel Fusion: Recall — What is Operator Fusion and Why Does It Matter?", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is kernel fusion, which HBM bottleneck does it address, and what bandwidth-saving formula applies?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2988", "title": "Kernel Fusion: Design a Fused Attention Kernel for H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For the 4096x64x128 attention layer, how large are Q, K, V and the full score matrix, and why is FlashAttention tiling mandatory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2989", "title": "Kernel Fusion: Evaluate FlashAttention vs Standard Attention on H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 1K, 4K, and 16K context lengths, how do FlashAttention-2 and standard attention differ in memory use, throughput, and latency?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2990", "title": "Kernel Fusion: Evaluate Fused vs Unfused MLP Blocks", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which parts of the 13B MLP block are compute-bound versus memory-bound, and how much does fusion help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2991", "title": "Kernel Fusion: Implement Fused LayerNorm + Linear on H100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For batch=1 decode, how much HBM traffic does fusing LayerNorm + Linear save for hidden size 4096, and what latency gain should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2992", "title": "Kernel Fusion: Mastery — Design Fusion Strategy for High-Throughput LLM Prefill", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the top fusion opportunities to improve the 38% MFU for 70B LLM prefill on 8 GPUs, and what is the expected MFU improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2993", "title": "Kernel Fusion: Mastery — Fusion Impact Analysis for Autoregressive Decode", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 7B batch=1 decode, what fusion strategy is useful, and are the gains from bandwidth reduction or kernel-launch overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2994", "title": "Kernel Fusion: Optimize CUDA Kernel Fusion for Transformer LayerNorm", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 256x2048x6144 BF16 LayerNorm activation tensor, how many HBM bytes are transferred by naive three-pass versus fused LayerNorm, and how many bytes are saved per call?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2995", "title": "Kernel Fusion: Optimize Fusion Strategy for Multi-Query Attention", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do fused and unfused MQA kernels differ in decode mode for the 13B LLM, and how much KV-cache bandwidth does MQA save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-2996", "title": "Decode Attention FLOP Estimation for a 70B LLM", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the correct per-layer attention FLOP count, and why is the 8.59 TFLOPs estimate fundamentally flawed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2997", "title": "Kernel Fusion: Realize Fusion ROI Analysis for Production LLM Serving", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the break-even time for implementing kernel fusion on this LLM serving system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2998", "title": "Kernel Fusion: Specification — Define Fusion Requirements for Custom CUDA Kernel", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What correctness, performance, precision, and validation requirements would you set for the fused RMSNorm + QKV CUDA kernel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-2999", "title": "KV-Cache: Design PagedAttention KV Cache for Multi-Tenant LLM Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a PagedAttention KV cache for 100 concurrent 128-4096-token requests on 8 H100s to maximize GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3000", "title": "Design Prefix Sharing KV Cache for System Prompt Optimization", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a prefix KV cache, and what TTFT savings would it provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3001", "title": "KV-Cache: Evaluate KV Cache Quantization Tradeoffs on H100", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16, INT8, and INT4 KV caches compare for HBM usage, decode throughput, and quality on this 13B workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3002", "title": "KV-Cache: Evaluate PagedAttention vs Continuous Batching Scheduling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate GPU 
utilization, memory waste, and throughput for static preallocation versus PagedAttention continuous batching for this mixed workload.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3003", "title": "Per-GPU KV Cache Capacity for a 70B GQA Model", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the per-GPU HBM allocation for model weights, KV cache, and activations, and how many tokens can be cached simultaneously?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3004", "title": "Design KV Cache Infrastructure for 100K RPS LLM Platform", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What complete KV-cache strategy would you use to serve 100K RPS with 256-token inputs and 128-token outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3005", "title": "Long-Context KV Cache Budget for 13B Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you budget HBM to support at least 10 concurrent 32K-context requests for the 13B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3006", "title": "KV-Cache: Optimize KV Cache Eviction Policy for LLM Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What KV-cache memory per token should you use when designing the eviction policy for this 7B GQA-8 workload, and how does it inform the eviction strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3007", "title": "KV-Cache: Realize KV Cache Memory Layout for Tensor-Parallel 70B Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the 70B KV cache be laid out per GPU with TP=8 and 8 KV heads, and what capacity does it provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3008", "title": "KV-Cache: Realize Cross-Request KV Cache Sharing for RAG Applications", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache memory per token should the shared RAG document cache budget for the 32-layer GQA-8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "",
"napkin_math": ""}}, {"id": "cloud-3009", "title": "KV-Cache: Specification — Define KV Cache SLAs for Production LLM API", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV-cache size per token and per 4K-context request should the production 7B API budget to meet these requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3010", "title": "KV-Cache: Specification — KV Cache Budget for Multi-Model Serving", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For two GQA-8 models on a single H100 budgeting 128 KiB/token and up to 2048 active tokens per request, what KV-cache partitioning, priority policy, and overflow handling should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3011", "title": "KV-Cache: Specification — KV Cache Sizing for Speculative Decoding", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For the target model only, what per-token KV-cache budget is required, using the 70B target architecture assumption of 80 layers, GQA-8, head_dim=128, and fp16/bf16 KV precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3012", "title": "Dataset Curation: Evaluate Training Data Quality Metrics for LLM Fine-Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which metric better predicts downstream task performance: instruction-following rate or semantic diversity score?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3013", "title": "Dataset Curation: Realize Data Versioning Infrastructure for Continuous LLM Training", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the data versioning system to support weekly data arrivals, reproducible runs, rollback, A/B mixes, and audit trails?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3014", "title": "Latency Decomposition: Compare Batched vs. 
Streaming Inference on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do batched serving and individual continuous batching compare in end-to-end latency for 512-token prefill and 256-token decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3015", "title": "Latency Decomposition: Compare KV-Cache vs. No-Cache Decode Latency on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At sequence length 1024, how does per-token decode latency compare with and without a KV cache for the 7B model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3016", "title": "Latency Decomposition: Full-Stack E2E Latency Budget for LLM API on H100 Cluster", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you budget the P99 latency components, identify the dominant term, and optimize to fix an SLA miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3017", "title": "Latency Decomposition: Diagnose and Fix Decode Latency Regression on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did memory utilization jump to 89% after moving from 13B to 20B, and how would you quantify the fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3018", "title": "Latency Decomposition: Size the Per-Component Latency for a 70B Model Serving Request", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What NVLink bandwidth and all-reduce overhead should be included in the 4-GPU tensor-parallel latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3019", "title": "Latency Decomposition: Specify Prefill/Decode Split Requirements for SLA on H100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What prompt length, decode length, GPU count, and batch-size constraints satisfy TTFT < 200ms and TPOT < 30ms for the 13B chat app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3020", "title": "MLOps Lifecycle: Design a CI/CD Pipeline for LLM Fine-Tuning", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": 
"both", "question": "What CI/CD pipeline would support nightly 7B fine-tuning, benchmark evaluation, 5% canaries, rollback under 5 minutes, and what are its storage and runtime budgets?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3021", "title": "MLflow vs. Weights & Biases Experiment Tracking Cost", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do self-hosted MLflow and W&B compare on operational overhead and total monthly cost for 1000 completed experiments?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3022", "title": "Spot Instance Fallback Cost for 16 H100 GPUs", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct extra cost of an on-demand fallback for the 16-GPU deployment for a one-hour interruption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3023", "title": "MLOps Lifecycle: Implement Model Registry Versioning Strategy for Multi-Region LLM", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the model registry schema and cross-region replication strategy to deduplicate 13B artifacts and allow <5-minute rollback?", "chain_ids": ["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 0}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3024", "title": "MLOps Lifecycle: End-to-End MLOps System Design for Production LLM on H100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "In a serving capacity model for 10M requests/day, what does the value 57.9 represent?", "chain_ids": ["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 2}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3025", "title": "MLOps Lifecycle: Concretely Size Model Registry Storage for a Large ML Organization", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much registry storage is needed for 200 experiments per week over a year, and what tiering strategy minimizes cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3026", "title": "MLOps Lifecycle: Specify a Model Evaluation Gate for Production Promotion on H100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": 
"specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantitative promotion thresholds should automatically gate a new 13B checkpoint for benchmark, latency, memory, quality, and errors?", "chain_ids": ["cloud-chain-auto-011-09"], "chain_positions": {"cloud-chain-auto-011-09": 1}, "chain_tiers": {"cloud-chain-auto-011-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3027", "title": "H100 NVLink Bandwidth for Tensor Parallel Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For H100 tensor parallel inference, what NVLink bandwidth should you use when estimating all-reduce communication overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3028", "title": "Model Size Estimation: Design Sharding Strategy for 405B Model on H100 Cluster", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What H100 NVLink bandwidth should the 405B sharding analysis use, and why is 600GB/s the wrong value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3029", "title": "Model Size Estimation: Design Quantization Strategy for Fitting 70B on Single H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantization and memory layout would fit a 70B LLaMA model on one 80GB GPU for low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3030", "title": "Comparing GPT-3 and LLaMA-2 Memory Footprints on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do GPT-3 175B and LLaMA-2-70B compare in inference memory for weights, KV cache, and activations at batch 16 and seq 2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3031", "title": "Model Size Estimation: Compare MoE vs. 
Dense Memory Footprint", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does Mixtral-8x7B inference memory compare with dense LLaMA-2-13B at batch 32 and sequence length 1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3032", "title": "Model Size Estimation for 7B Model Memory", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory does a 7B parameter model need at FP16 for inference with a batch of 8, generating up to 512 tokens?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3033", "title": "Model Size Estimation: Master Full Memory Budget for LLM Training on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the full GPU memory budget including weights, gradients, optimizer states, and activations, and does it fit on a single H100?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3034", "title": "Model Size Estimation: Master Attention KV-Cache Scaling Laws on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the KV-cache memory per token, max context capacity, and how does GQA (8 KV heads) change this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3035", "title": "Model Size Estimation: Diagnose OOM Error During Fine-Tuning", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 13B fine-tuning run OOM, and what minimum changes make it fit in 80GB?", "chain_ids": ["cloud-chain-auto-secondary-005-07"], "chain_positions": {"cloud-chain-auto-secondary-005-07": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3036", "title": "Diagnose KV-Cache Memory Leak on H100 Serving System", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing memory to grow from 35GB to 78GB over 4 hours, and how long until the 
80GB H100 OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3037", "title": "Model Size Estimation: Realize Full Memory Layout for 13B Model Serving on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the H100 memory budget for 13B LLaMA-2 FP16 serving and compute max concurrency at 1024 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3038", "title": "Model Size Estimation: Realize FP8 vs. FP16 Memory Comparison for 70B on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much H100 memory does a 70B LLaMA model need in FP16 vs FP8, and what KV-cache budget remains on 1, 2, and 4 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3039", "title": "Model Size Estimation: Realize Memory Budget for Multi-Modal LLM on H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does LLaVA-13B fit on one 80GB H100 at batch 8 and 4096 context, and what is the maximum batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3040", "title": "Compute-Bound Prefill Estimate for a 70B Model on One H100", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can a 70B model prefill a 512-token prompt in 37 ms on an H100, and what is the correct compute-based estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3041", "title": "Network Bandwidth Bottlenecks: Design NVLink vs. PCIe Topology for 8xH100 Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For TP-2 and DP-4 training on an 8xH100 DGX, which traffic uses NVLink versus PCIe, and how much bandwidth does each consume?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 3}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3042", "title": "Network Bandwidth Bottlenecks: Compare PCIe 4.0 vs. 
5.0 Impact on H100 Data Loading", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the actual input tensor size for batch 256 ImageNet 224x224 RGB float32 images, how long would it take to transfer over PCIe 4.0 versus PCIe 5.0, and why is an estimate of 3.3GB too high?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 2}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3043", "title": "Network Bandwidth Bottlenecks: Compare AllReduce over NVLink vs. InfiniBand", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 13B model with 26GB FP16 gradients, how long does the AllReduce take with NVLink versus InfiniBand, and what is the efficiency ratio?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 1}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3044", "title": "Network Bandwidth Bottlenecks: Recall PCIe and NVLink Bandwidth Specs", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the exact bidirectional bandwidth specifications for PCIe 5.0, NVLink 4.0, and NVSwitch, and when does each limit performance?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 0}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3045", "title": "Network Bandwidth Bottlenecks: Implement Ring-AllReduce Time Formula for H100 Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact Ring-AllReduce time for the 13B model on 8 H100s, and is it compute-bound or communication-bound if step time is ~100ms?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 0}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3046", "title": "3D-Parallel TP PP DP Communication Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the specific TP, PP, and DP communication costs per step, and which dominates the 3D-parallel setup?", "chain_ids": ["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 3}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3047", "title": "32-GPU 7B DDP: NVLink vs 200Gb/s InfiniBand AllReduce Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 32-GPU 7B data-parallel job, is NVLink or 200Gb/s InfiniBand the bottleneck, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3048", "title": "Network Bandwidth Bottlenecks: Realize AllReduce Data Volume for 70B Model Training", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 70B training on 64 H100s using TP-8 and DP-8, how much gradient data does each GPU AllReduce and how long does it take over 200Gb/s InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3049", "title": "PCIe and NVMe Checkpoint Bottleneck for 70B Models", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does checkpointing a 70B FP16 model from 8 GPUs through PCIe 5.0 to NVMe take, and can it be hidden during training?", "chain_ids": ["cloud-chain-auto-027-07"], "chain_positions": {"cloud-chain-auto-027-07": 1}, "chain_tiers": {"cloud-chain-auto-027-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3050", "title": "Network Bandwidth Bottlenecks: Specify InfiniBand Fabric for 256-GPU LLM Training Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What InfiniBand topology, rail count, per-GPU bandwidth, AllReduce target, and bisection bandwidth would you use for a 256-GPU 70B+ training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3051", "title": "Network Bandwidth Bottlenecks: Fluency — AllReduce Bandwidth in 60 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the AllReduce take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3052", "title": "13B Training on 16 GPUs: TP-2 and DP-8 AllReduce Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication schedule and optimizations are needed for 13B training on 16 GPUs across two nodes to reach over 90% GPU utilization?", "chain_ids": 
["cloud-chain-auto-027-08"], "chain_positions": {"cloud-chain-auto-027-08": 2}, "chain_tiers": {"cloud-chain-auto-027-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3053", "title": "Per-GPU Memory Budget for 7B Fine-Tuning with ZeRO-2 on 4 GPUs", "topic": "model-size-estimation", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the per-GPU memory budget for fine-tuning a 7B model on 4 GPUs with ZeRO-2, and does it fit in 80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3054", "title": "Pipeline Parallelism Bubble Overhead Comparison: GPipe vs PipeDream", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 4-stage pipeline with 16 micro-batches, what are the GPipe and PipeDream-Flush bubble overheads, and which should train a 32B model?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 2}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3055", "title": "Pipeline Parallelism Bubble Overhead Comparison: Stage Count Tradeoff", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which configuration is better for throughput (4-stage or 8-stage), and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3056", "title": "Implement Pipeline Parallelism Bubble Fraction Formula", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For p=8 pipeline stages, how many micro-batches keep bubble overhead below 5%, and what GPipe activation memory does that create per H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3057", "title": "TP-PP-DP Sizing for 530B Training on 512 H100s", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you choose TP, PP, and DP degrees for a 530B model on 512 H100s, and what are the resulting bubble overhead and memory per device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3058", "title": "Pipeline Parallelism Optimization: Diagnosing Bubble Waste and Fixing", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is the 8-stage GPipe job 
with m=8 only 71% utilized, and how much would 1F1B with m=64 improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3060", "title": "Pipeline Parallelism Realization: Memory per Stage for LLaMA-70B", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 8-stage pipeline parallel LLaMA-70B training on 80GB H100s with DP=1, how much memory does each stage need, does it fit, and what headroom remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3061", "title": "Pipeline Parallelism Realization: Activation Memory with Gradient Checkpointing", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much activation memory is saved by checkpointing per stage, and does it fit in an 80GB H100 without it?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 1}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3062", "title": "Pipeline Parallelism Recall: GPipe Bubble Overhead Formula", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the GPipe bubble fraction formula, its variables, its m→∞ limit, and the value for p=4, m=16 on H100s?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 0}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3063", "title": "Pipeline Parallelism Specification: Design for <10% Bubble with Memory Constraint", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "With fixed TP=8 and PP=8 on 512 GPUs, what DP and smallest micro-batch count meet the 70GB memory and <10% bubble targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3064", "title": "Pipeline Parallelism Specification: Inter-Stage Communication Budget", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "For a 16-stage H100 pipeline training a 1T model, what activation and gradient volume crosses each stage boundary and is 400Gb/s InfiniBand sufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3065", "title": "Pipeline Parallelism Fluency: Rapid Bubble Estimation", "topic": "pipeline-parallelism", "competency_area": 
"parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What are the estimates for the bubble fraction (p=8, m=56), the micro-batch count for <5% bubble (p=4), and the throughput lost to a 12% bubble on a 500 TFLOP/s cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3066", "title": "Queueing Theory Recall: Little's Law in Inference Systems", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the throughput of a server with 50 concurrent requests and a 2-second service time according to Little's Law, and how does it change if latency doubles?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 0}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3067", "title": "Queueing Theory Analyze: Why Tail Latency Explodes at High Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At 89% inference utilization, how does M/M/1 queueing explain P99 latency far above P50, and what happens at 95% utilization?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 2}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3068", "title": "Queueing Theory Design: Autoscaling Policy for P99 SLO", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using a per-H100 M/M/1 approximation, what safe utilization, scale trigger, and fleet size are needed for 1000 req/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3069", "title": "Queueing Theory Design: Multi-Server M/M/c for Inference Cluster", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using Erlang C, what minimum c keeps estimated P99 queue delay plus mean service under 200ms, and how does one server compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3070", "title": "Queueing Theory Evaluation: FCFS vs Priority Queuing for Mixed Workloads", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Compare FCFS and preemptive short-job priority for class-specific P99; what happens to aggregate P99 compliance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3071", "title": "Queueing Theory Evaluation: Continuous vs Batch Inference Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What H100 FP16 TFLOPS specification should be used for LLaMA-13B serving throughput calculations, and how does continuous batching improve latency over static batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3072", "title": "Queueing Theory Evaluation: M/M/1 vs M/D/1 for Deterministic Service", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the correct M/D/1 mean queue wait formula and value at 80% utilization for 100 ms deterministic service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3073", "title": "Queueing Theory Fluency: M/M/1 Metrics from Memory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For an M/M/1 queue at ρ=0.95 and μ=40 req/s, what is the correct Wq formula and wait time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3074", "title": "Queueing Theory Implement: Erlang-C for Inference Cluster Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Erlang-C with λ=200 req/s and μ=100 req/s per server, how many servers are needed for P99 500ms and what is P(wait0)?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 2}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3075", "title": "Queueing Theory Mastery: Inference System Capacity Planning End-to-End", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many 4-GPU pods are required to support the 5000 req/s workload, and what is the estimated hourly cluster cost at $2/hr per GPU?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 3}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3076", "title": "Speculative Decoding for LLaMA-70B: M/M/1 vs M/G/1 P99 Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does speculative decoding change M/M/1 to M/G/1 queueing for LLaMA-70B, and what approximate P99 improvement should you expect?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3077", "title": "Queueing Theory Optimization: Reduce P99 by 3x Without Adding Hardware", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which no-new-hardware operational changes would bring P99 below 2s, and how much P99 improvement does each provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3078", "title": "Queueing Theory Optimization: Tail Latency via Load Shedding", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At λ=45 and μ=50, is M/M/1/K load shedding needed to meet P99 < 1s, and what rejection fraction maximizes throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3079", "title": "Queueing Theory Realization: Size an H100 Inference Queue for Black Friday Traffic", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many H100s are needed for the 2000 req/s Black Friday burst, what does the 30-minute burst cost, and should you pre-warm capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3080", "title": "Queueing Theory Realization: Queue Memory Sizing for KV Cache Pooling", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the context distribution, what is mean KV-cache memory per active request and how many requests can be KV-resident before OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3081", "title": "Queueing Theory Realization: Compute vs Memory Bottleneck in Queued Requests", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is an H100 serving Mistral-7B decode compute-bound or memory-bound, and how does this dictate using an M/D/1 instead of an M/M/1 queueing model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3082", "title": "Queueing Theory Specification: Design a Latency SLO Budget", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you allocate the 500ms P99 latency budget across the load balancer, router, and GPU tiers, and size each M/M/c tier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3083", "title": "Systolic Array 
Analyze: Why TPU v5e Outperforms GPU for Matrix Multiply", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a TPU v5e often outperform an H100 for BERT-large inference despite a 5x lower peak FLOP rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3084", "title": "Systolic Array Design: Weight-Stationary vs Output-Stationary Tradeoff", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the 2048-token attention layer Q*K^T on a 256x256 systolic array, should you use weight-stationary or output-stationary dataflow, and why?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 3}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3085", "title": "Systolic Array Design: Tiling Strategy for Large Matrix on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tile sizes, tile counts, and HBM traffic are needed for the 16384x16384 FFN GEMM on a 128x128 array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3086", "title": "Systolic Array Evaluation: TPU v5e vs H100 for Training Transformers", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the H100 FP16 ridge point, and when do decode and prefill become compute- versus memory-bound based on arithmetic intensity?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3087", "title": "Systolic Array Evaluation: Batched vs Streaming Inference on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For ResNet-50 on TPU v5e, what are throughput and latency for batch=128 versus batch=1, and which mode fits interactive vs batch serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3088", "title": "Systolic Array Fluency: Arithmetic Intensity from Memory", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the arithmetic intensities for the GEMM, ReLU, and QK^T cases, and which are compute- or memory-bound?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], 
"chain_positions": {"cloud-chain-auto-secondary-010-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3089", "title": "Systolic Array Implement: GEMM Performance on TPU v5e", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the GEMM time and bottleneck, the MFU at 8.5ms measured time, and the bandwidth impact of 4-way tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3090", "title": "Systolic Array Mastery: Full Transformer Layer Analysis on TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do the attention and FFN FLOPs and bandwidth break down for this transformer layer, what is the bottleneck, and what tile/dataflow should XLA use?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 4}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3091", "title": "TPU v5e Roofline Analysis for Batch-1 LLM Decode", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the 12 TFLOPS decode kernel compute- or memory-bound, what is the attainable performance, and which optimization gives the largest gain?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3092", "title": "Systolic Array Optimization: Fix Low MFU on TPU v5e", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is BERT-large at only 7.6% MFU on TPU v5e, and how much speedup should fusing GeLU, LayerNorm, and Linear deliver?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3093", "title": "Attention Layer Systolic Array Tile Count and Runtime", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory and compute time does this attention layer need, and how many 128x128 systolic-array tiles are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3094", "title": "Systolic Array Realization: Memory Layout for TPU Batch Inference", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", 
"level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which weight layout should TPU v5e use for the 8192x32768 feed-forward layer, and how much does layout affect bandwidth and performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3095", "title": "TPU v5e FFN Batch-Sequence Size for High Utilization", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What batch * sequence size is needed for the FFN layer to be compute-bound at ≥99% utilization on TPU v5e, and how do you derive it?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3096", "title": "TCO Analyze: Why Cloud GPUs May Be Cheaper Than On-Prem for Startups", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At what utilization does cloud become cheaper?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 2}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3097", "title": "TCO Design: Spot vs On-Demand vs Reserved Instance Strategy", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What Spot, On-Demand, and Reserved H100 mix meets the 90% reliable-capacity floor, and what is the effective cost per H100-hour?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 1}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3098", "title": "TCO Design: Cost Per Inference for Production API Service", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Calculate: cost per 1M tokens, revenue required per 1M tokens to achieve 40% gross margin, and how INT4 quantization (fit on 2 GPUs) changes the economics?", "chain_ids": ["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 2}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3099", "title": "TCO Evaluation: H100 vs TPU v5e for Training Cost", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the training time, compute cost, and cost per effective FLOP for H100 vs TPU v5e?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3100", "title": "TCO Evaluation: Cost of Quantization for Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the cost per 1M tokens for FP16, INT8, and INT4 LLaMA-13B, and which quantization is the best cost-quality tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3101", "title": "TCO Mastery: Build vs Buy Decision for LLM Training Infrastructure", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct annual power cost for a 256-GPU cluster at 700W per GPU, $0.08/kWh, and PUE 1.3?", "chain_ids": ["cloud-chain-auto-018-01"], "chain_positions": {"cloud-chain-auto-018-01": 4}, "chain_tiers": {"cloud-chain-auto-018-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3102", "title": "TCO Mastery: Carbon Cost and Sustainable AI Infrastructure", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the operational carbon and offset cost for GPT-3 training on 8 H100s at 45% MFU, and how do coal versus renewable power sources compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3103", "title": "TCO Optimization: Rightsize GPU Fleet for Inference Workload", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much annual GPU spend is wasted by keeping 150 GPUs on 24/7, and what autoscaling plan would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3104", "title": "TCO Optimization: Training Efficiency vs Infrastructure Cost", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the break-even total infrastructure budget required to justify the $60K engineering investment, and what are the ongoing cost savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3105", "title": "H100 Training Cost Check for a 13B Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What H100 FP16 throughput, 8-GPU aggregate FLOPs, and two-run training cost should they use for the budget check?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3106", 
"title": "TCO Realization: Annual GPU Cost for GPT-4-Scale Service", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPT-4 serving pods and GPUs are needed for 100M queries/day, and what are the annual and monthly GPU costs?", "chain_ids": ["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 1}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3107", "title": "TCO Specification: Design Cost-Optimal Inference for SLO", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the minimum-cost fleet mix of FP16 and INT4 pods to meet the throughput and latency SLOs, and what is its cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3108", "title": "TCO Specification: Design Multi-Region Inference Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you allocate pods across US, EU, and APAC for 500 req/s, what is the annual cost, and why not centralize in US-East?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3109", "title": "Transformer Cost Design Optimal Architecture for Inference Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What model configuration and H100 fleet meet P99<500ms, 100 req/s, and <$0.001/request starting from GPT-2 large?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 3}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3110", "title": "Transformer Cost Evaluation: GPT-2 vs LLaMA-7B Inference Cost", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the decode throughput, request latency, and cost per 1M tokens for GPT-2 large BF16 versus LLaMA-7B INT4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3111", "title": "Transformer Cost Evaluation: Scaling Law Cost Prediction", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using C=6ND, what are the training costs for 70B on 1.4T tokens and 7B on 1T tokens, and which run is Chinchilla-aligned?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3112", "title": "Transformer Cost Fluency: FLOPs Estimation from Memory", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 6ND GPT-3 training FLOPs, 70B inference FLOPs per token, and H100-days to train GPT-3 at 40% MFU?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 1}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3113", "title": "Transformer Cost Implement: Chinchilla Optimal Model Size Calculation", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a $100K budget, what Chinchilla-optimal model size and token count should you train, and how does it compare to a 7B run?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 2}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3114", "title": "Transformer Cost Mastery: Full Training and Inference Cost Analysis for LLM Product", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the training cost, inference fleet cost at 1000 req/s, and break-even monthly revenue for the 30B LLM product?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 4}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3115", "title": "Speculative Decoding Speedup for 70B INT4 with 7B Draft", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What speedup, cost reduction, and memory overhead does speculative decoding with a 7B draft model provide for the 70B INT4 service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3116", "title": "Transformer Cost Realization: Concrete FLOP Count for BERT Training", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are BERT-base training FLOPs by 6ND and by layer-by-layer counting, and how do the two estimates compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3117", "title": "Transformer Cost Realization: Size Activation Memory for LLM Training", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What is the peak activation memory with and without checkpointing, and is model parallelism required on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3118", "title": "Analyze GPipe Bubble Overhead in Pipeline Parallelism", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does reducing from 16 to 8 micro-batches affect the pipeline bubble ratio and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3119", "title": "Design 1F1B Pipeline Schedule for LLM Training on H100 Cluster", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 1F1B schedule would you use, and what are its steady-state throughput and bubble ratio versus GPipe?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 3}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3120", "title": "Design Interleaved Pipeline Schedule for Reduced Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "With v=2 interleaving, what are the new bubble ratio, steady-state efficiency, and added communication cost?", "chain_ids": ["cloud-chain-auto-023-16"], "chain_positions": {"cloud-chain-auto-023-16": 2}, "chain_tiers": {"cloud-chain-auto-023-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3121", "title": "Diagnose Load Imbalance Across Pipeline Stages on H100", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the 4-stage pipeline imbalance, and how would you rebalance the stages to recover throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3122", "title": "Diagnose Head-of-Line Blocking in LLM Serving Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the 4.2s P99 latency at 60% utilization, and how would you fix the scheduler?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3123", "title": "Recall Systolic Array Weight-Stationary Dataflow", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In a 
weight-stationary 256x256 systolic array, what data stays stationary, how many times are weights read for a batch of 64 inputs, and how does this compare to naive execution?", "chain_ids": ["cloud-chain-auto-secondary-010-23"], "chain_positions": {"cloud-chain-auto-secondary-010-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3124", "title": "Diagnose Systolic Array Underutilization for Non-Square Matrices", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does TPU v5e utilization collapse to 0.4% at batch size 1, and what minimum batch gives over 50% utilization?", "chain_ids": ["cloud-chain-auto-secondary-010-24"], "chain_positions": {"cloud-chain-auto-secondary-010-24": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3125", "title": "Recall TCO CapEx vs OpEx Split for GPU Cloud Training", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fully-loaded hourly cost per GPU including CapEx amortization and OpEx?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3126", "title": "Implement Cost Allocation Per Training Run on Shared H100 Cluster", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate per-job costs, allocate shared overhead, and attribute idle time on the 64-GPU cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3127", "title": "Diagnose TCO Anomaly from Checkpoint I/O Costs on Cloud", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is checkpoint frequency or storage retention the primary driver of the $180K storage and egress bill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3128", "title": "TCO Fluency: Compute Cost per Token at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the serving cost per 1M output tokens for GPT-3 175B on 4 GPUs at $2.50/GPU-hour?", "chain_ids": ["cloud-chain-auto-018-02"], "chain_positions": {"cloud-chain-auto-018-02": 0}, "chain_tiers": {"cloud-chain-auto-018-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3129", "title": "Recall Transformer Inference FLOPs Formula", "topic": 
"transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the standard FLOPs approximation formulas for inference and training, and how do they scale with sequence length?", "chain_ids": ["cloud-chain-auto-006-01"], "chain_positions": {"cloud-chain-auto-006-01": 0}, "chain_tiers": {"cloud-chain-auto-006-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3130", "title": "Analyze Chinchilla Compute-Optimal Training Budget on H100", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What compute-optimal model size and token count does a $5M H100 training budget buy, and how does it compare to GPT-3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3131", "title": "Diagnose Memory-Bandwidth-Bound Decode on H100 Serving Cluster", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is batch-1 decode for the 70B INT8 model compute-bound or memory-bandwidth-bound, and why is throughput about 52 tokens/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3132", "title": "Specify KV Cache Memory Requirements for Long-Context LLM Serving", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What KV cache size, total VRAM need, remaining weight budget, and minimum GPU count are required for 32 requests at 128K context?", "chain_ids": ["cloud-chain-auto-006-07"], "chain_positions": {"cloud-chain-auto-006-07": 2}, "chain_tiers": {"cloud-chain-auto-006-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3133", "title": "FlashAttention Tile Size vs SRAM Capacity", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a model with head_dim=128 in FP16, what is the maximum tile size (B_r x B_c) that fits in SRAM, and why does exceeding it force spills to HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3134", "title": "FlashAttention IO Complexity vs Standard Attention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Compare the HBM IO bytes of standard attention vs FlashAttention, and estimate the wall-clock difference on a 2 TB/s GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3135", "title": "PagedAttention Memory 
Fragmentation", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache VRAM is wasted per GPU by preallocating 4096 tokens when the average length is 512, and how does PagedAttention eliminate it?", "chain_ids": ["cloud-chain-auto-014-04"], "chain_positions": {"cloud-chain-auto-014-04": 1}, "chain_tiers": {"cloud-chain-auto-014-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3136", "title": "Implementing FlashAttention's Online Softmax", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What online-softmax rescaling rule should FlashAttention use when a new K tile raises the row max, and what FLOP overhead does it add?", "chain_ids": ["cloud-chain-auto-014-06"], "chain_positions": {"cloud-chain-auto-014-06": 1}, "chain_tiers": {"cloud-chain-auto-014-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3137", "title": "Diagnosing FlashAttention Regression on Short Sequences", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can FlashAttention-2 be slower than standard attention at seq_len=64 but much faster at seq_len=2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3138", "title": "Specifying KV Cache Budget for PagedAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What per-GPU PagedAttention block size, block budget, and KV memory reservation are required, and does the 512-request target fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3139", "title": "FlashAttention Arithmetic Intensity Calculation", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of standard attention versus FlashAttention, and does FlashAttention shift it to being compute-bound?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 1}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3140", "title": "Evaluating Ring Attention vs FlashAttention for 128K Context", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 128K context on 8 H100s, how do FlashAttention-2 with Ulysses and Ring Attention compare in communication, 
memory, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3141", "title": "Designing a Prefix Caching Strategy for RAG", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design prefix caching for the 50 shared 2048-token prompts, and what memory cost, hit rate, and TTFT gain result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3142", "title": "Optimizing FlashAttention Tile Size for MI300X", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does MI300X's 64KB LDS change FlashAttention tile size for head_dim=128 FP16, and what throughput impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3143", "title": "Full-Stack FlashAttention Deployment Decision", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the complete attention stack, KV cache management, sequence parallelism, and prefix caching strategy, and why is 810 TFLOPs per token a unit error?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3144", "title": "Sliding Window Attention Memory Savings", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much KV cache does 32K context use with 16 sliding-window and 16 full-attention layers versus full attention on all 32 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3145", "title": "FlashAttention-2 vs FlashAttention-1 Parallelism", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does FlashAttention-2 achieve higher occupancy than FlashAttention-1 for an H100 workload (batch=1, 32 heads, seq_len=16384, head_dim=128)?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 2}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3146", "title": "KV Cache Quantization for Tensor-Parallel Serving", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV-cache quantization strategy would let the 4-GPU 70B service reach 500 concurrent requests, and what quality tradeoff does it make?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3147", "title": "FlashDecoding for Long-Context Decode", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much speedup should FlashDecoding provide at 32K context, and why does splitting the KV sequence across blocks help?", "chain_ids": ["cloud-chain-auto-014-07"], "chain_positions": {"cloud-chain-auto-014-07": 1}, "chain_tiers": {"cloud-chain-auto-014-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3148", "title": "Chunked Prefill Scheduling with FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "With a 50ms decode SLO and 30ms prefill budget on 8 H100s, what chunk size should chunked prefill use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3149", "title": "PagedAttention Block Size Tradeoff", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did switching PagedAttention from block_size=16 to block_size=1 improve memory utilization but increase decode latency by 35%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3150", "title": "GQA-Aware FlashAttention Kernel Design", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV bandwidth does a naive GQA FlashAttention implementation waste, and how would you tile to reuse each KV head across 8 query heads?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3151", "title": "FlashAttention Backward Pass Memory Savings for Training", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much activation memory does FlashAttention save by recomputing attention for seq_len=8192, and does it enable a larger batch size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3152", "title": "Multi-Query Attention vs GQA with FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do MQA and GQA-8 compare for decode throughput, KV cache size, and quality at batch=64 and seq_len=4096?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3153", "title": "FlashAttention FP8 on H100 Transformer Engine", "topic": "flash-attention", 
"competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What throughput speedup should FP8 FlashAttention-3 deliver over FP16 at seq_len=8192, and when does FP8 attention hurt quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3154", "title": "PagedAttention Copy-on-Write for Beam Search", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How much KV cache memory does naive beam_width=4 waste at 2048 tokens, and how does PagedAttention copy-on-write avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3155", "title": "FlashAttention Causal Masking Efficiency", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What fraction of 128×128 FlashAttention tiles are skipped by the causal mask at seq_len=4096, and what speedup results for prefill and decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3156", "title": "Ring Attention Communication-Computation Overlap Budget", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is the 400 Gb/s InfiniBand link a bottleneck for Ring Attention across 16 GPUs at 256K context, and what ring ordering should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3157", "title": "Prefix Caching Eviction Policy Design", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which eviction policy (LRU, LFU, ARC) maximizes the KV cache hit rate for a power-law prompt distribution, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3158", "title": "FlashAttention-3 Asynchronous Softmax Pipelining", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For seq_len=8192, head_dim=128, and tile_size=128 on H100, what pipeline efficiency and speedup does FlashAttention-3 achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3159", "title": "FlashAttention Numerical Stability in Mixed Precision Training", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why must FlashAttention keep row max and log-sum-exp in FP32 on H100, 
and what fails if they are kept in FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3160", "title": "Speculative Decoding Acceptance Rate Fundamentals", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "If the acceptance rate is alpha=0.8 per token, what is the expected number of accepted tokens per speculation round, and what is the effective speedup over autoregressive decoding?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 2}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3161", "title": "Speculative Decoding Throughput Degradation Under Load", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can K=5 speculative decoding with a 7B draft reduce throughput by 15% for 200 concurrent requests?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 1}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3162", "title": "Designing a Draft Model Selection Strategy", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which draft option is best when acceptance rate and draft latency both affect speculative decoding speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3163", "title": "Implementing Rejection Sampling for Lossless Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does rejection sampling preserve the target distribution, and why is KL(p||q)=0.5 insufficient to determine a numeric acceptance rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3164", "title": "Diagnosing Speculation Failure on Code Generation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At alpha=0.45 and K=5, how many tokens per speculation step are expected, and can this make code generation slower than baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3165", "title": "Comparing Speculation Length K Under Draft Cost", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 80 ms target
decode, 3 ms/token draft cost, and alpha=0.78, does K=5 or K=15 give higher throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3166", "title": "Speculative Decoding Memory Bandwidth Analysis", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What effective memory-bandwidth amplification does K=5 speculative decoding with alpha=0.80 provide over batch-1 autoregressive decode?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 0}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3167", "title": "Tree-Structured Speculation vs Linear Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which produces more served tokens per verification round, and what is the verification cost difference?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 1}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3168", "title": "Designing Medusa Heads for Self-Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the Medusa heads for a 70B target with hidden_dim=8192 on 4x H100, and what is their overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3169", "title": "Speculative Decoding KV Cache Management", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If draft tokens 1-3 are accepted and token 4 is rejected, which target KV cache entries are kept, which are discarded, and how is the correction KV stored?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3170", "title": "Diagnosing Speculation Latency Regression at High Temperature", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does alpha fall from 0.82 at temperature 0.1 to 0.55 at temperature 1.0, and how would you fix the slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3171", "title": "Draft Model Placement on Multi-GPU Inference", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For a 405B target on 8 GPUs, should the 7B draft run with TP=8,
on one of the 8 GPUs, or on a separate 9th GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3172", "title": "Speculative Decoding with Continuous Batching", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should continuous batching verify requests with variable speculation lengths in a single target forward pass?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 2}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3173", "title": "Adaptive Speculation Depth Based on Token Entropy", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an entropy-based adaptive K system that monitors draft model confidence and adjusts speculation depth token-by-token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3174", "title": "Full-Stack Speculative Decoding Architecture", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What throughput can 128 H100s provide if there are 16 8-GPU replicas, batch 16 per replica, 3.5 tokens per step, and 84 ms steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3175", "title": "Speculative Decoding Token Probability Calibration", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why must speculative decoding use the full vocabulary probability distributions rather than just comparing top-1 tokens, and what property does the rejection sampling procedure guarantee?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3176", "title": "Speculative Decoding vs Increasing Batch Size", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For options (A) speculative decoding (K=5), (B) batch=8, and (C) both, how do latency, throughput, and GPU utilization compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3177", "title": "Sizing an EAGLE Draft Head for a 70B Model", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an EAGLE-style draft head for a 70B target with hidden_dim=8192, and what memory and training costs follow?",
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3178", "title": "Speculative Decoding Verification Compute Cost", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does verifying K=5 draft tokens take about the same wall-clock time as one batch-1 autoregressive step, and what is the cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3179", "title": "Speculative Decoding for Fill-in-the-Middle Tasks", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does fill-in-the-middle generation affect 7B draft acceptance for a 70B target, and how should speculation be modified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3180", "title": "Speculative Decoding Impact on Time-to-First-Token", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did speculative decoding increase TTFT from 200 ms to 350 ms, and how would you fix the first-token delay?", "chain_ids": ["cloud-chain-auto-025-11"], "chain_positions": {"cloud-chain-auto-025-11": 0}, "chain_tiers": {"cloud-chain-auto-025-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3181", "title": "Parallel Draft and Target Execution Scheduling", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you overlap draft generation of round N+1 with target verification of round N, and what rollback issue arises?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3182", "title": "Speculative Decoding Acceptance Break-Even", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For K=5, what alpha makes speculative decoding break even, and what expected-token formula applies?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 3}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3183", "title": "Speculative Decoding for Batched Multi-Turn Chat", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does the 7B draft KV cache for 8K-token histories limit capacity for 100 concurrent chat sessions?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3184", "title": "Staged Speculation for Ultra-Long Generation", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should speculation be staged across a 10K-token 405B generation, and why does tree speculation help most in the body?", "chain_ids": ["cloud-chain-auto-025-10"], "chain_positions": {"cloud-chain-auto-025-10": 2}, "chain_tiers": {"cloud-chain-auto-025-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3185", "title": "Speculative Decoding Cost-Benefit for Serving Economics", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you meet a 15 ms/token P50 target by adding speculation or doubling from 64 to 128 H100s, and what are the cost and tail-latency tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3186", "title": "MoE All-to-All Communication Volume", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a local micro-batch of 2048 tokens per GPU, what is the all-to-all communication volume per layer in each direction?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 1}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3187", "title": "MoE Capacity Factor and Token Dropping", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you raise the MoE capacity factor from 1.25 to 2.0 to fix 8% token drops, or address routing collapse another way?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3188", "title": "MoE Routing Collapse During Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the stalled training and low router entropy despite a converged load-balancing loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3189", "title": "MoE Memory Footprint vs Dense Equivalent", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many GPUs does each require, and what is the memory efficiency gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3190", "title": "MoE Expert Replication for Hot Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you reduce P99 latency when 2 of 8 MoE experts receive 40% of tokens, and what hardware cost it adds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3191", "title": "MoE Gating Network Overhead", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the parameter count of the router, and what fraction of total layer parameters does it represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3192", "title": "MoE Expert-Choice vs Token-Choice Routing", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the training and serving tradeoffs between token-choice and expert-choice routing for a 64-expert MoE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3193", "title": "MoE Serving with Offloaded Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you offload experts for a 671B, 256-expert MoE on one 8x H100 node, and what latency does PCIe fetching add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3194", "title": "MoE Auxiliary Loss Coefficient Tuning", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you tune MoE routing when α=0.01 leaves dead experts but α=0.1 hurts perplexity?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 2}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3195", "title": "Shared Expert Impact on MoE All-to-All Traffic", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does adding one shared expert alongside 8 routed experts reduce MoE all-to-all communication, and what bottleneck can it introduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3196", "title": "MoE Training Throughput vs Dense Model", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "Why is the 8x7B MoE 34% slower than a dense 13B on 64 H100s despite similar active FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3197", "title": "MoE Granularity: Few Large vs Many Small Experts", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the systems tradeoffs between 8x125B coarse experts and 256x4B top-8 fine-grained experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3198", "title": "MoE Batch Size Impact on Expert Utilization", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If batch size 4 produces 8 independent expert choices over 8 experts, how many distinct experts are expected to be activated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3199", "title": "MoE Quantization Strategy for Serving", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using consistent parameter counts for Mixtral 8x22B, how much memory does a BF16-shared, INT4-expert quantization plan need, and how many 80GB GPUs are required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3200", "title": "MoE Fine-Grained Expert Parallelism Communication", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication constraints and placement rules should the all-to-all expert dispatch use across 32 H100 nodes with NVLink and NDR?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 2}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3201", "title": "MoE Prefill vs Decode Phase Routing Differences", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What parallelism layouts should you use for MoE prefill and decode, and why should they differ?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3202", "title": "MoE Parameter Count vs Active FLOPs Scaling", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-token FLOPs and total parameters for the dense 70B and 8x MoE, and why can MoE match quality at lower cost?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3203", "title": "MoE Checkpointing Overhead at Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What checkpoint size, write time, and nonblocking strategy are needed to checkpoint the 671B MoE every 15 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3204", "title": "MoE Inference Token Routing Latency Breakdown", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency breakdown for one MoE decode layer at batch 32, including router, dispatch, expert compute, and combine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3205", "title": "MoE vs Dense Scaling Law Crossover", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the estimated GPT-4-scale FLOP budget, and what is the exact loss reduction from scaling compute by 8×?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 3}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3206", "title": "MoE Auxiliary Loss Interaction with Gradient Accumulation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does changing gradient accumulation from 1 to 8 destabilize MoE routing, and how would you fix the load-balancing signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3207", "title": "MoE Expert Specialization Analysis", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this specialization desirable or a sign of poor routing, and how does this affect serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3208", "title": "MoE All-to-All Overlap with Expert Computation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can K=4 sub-batching overlap all-to-all with expert compute when communication is 40% of the step, and what speedup should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3209", "title": "MoE Dropless (dMoE) Token Processing 
Guarantee", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does dropless MoE eliminate token dropping, and what are the memory implications on H100 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3210", "title": "MoE Multi-Node Placement Strategy", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you place 256 top-8 experts across 4 H100 nodes when some expert pairs are co-activated 70% of the time?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 3}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3211", "title": "Tiered MoE Expert Caching Across HBM DRAM and NVMe", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a tiered caching system for 1024 two-GB experts across GPU HBM, CPU DRAM, and NVMe SSDs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3212", "title": "MoE Serving Cost-per-Token Economics", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What active and full FP16 weight footprints should you use for Mixtral 8x7B in the serving cost comparison, and why do both models require the same number of GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3213", "title": "MoE Sparse Upcycling from Dense Model", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the systems costs and expected quality tradeoffs of sparse upcycling a dense 7B model into an 8-expert MoE versus training from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3214", "title": "DeepSeek-V3-Scale MoE Parallelism Layout on 2048 H100s", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What EP, TP, PP, and DP dimensions would you use to train this 671B, 256-expert MoE on 2048 H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3215", "title": "KV-Cache Size Calculation for GQA Models", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "What are the per-layer, per-token-total, and batch-32 BF16 KV-cache sizes at 4096 context with 8 KV heads?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 1}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3216", "title": "PagedAttention Block Size Selection", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which PagedAttention block size—8, 16, or 32 tokens—should you choose for this variable-length workload, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3217", "title": "KV-Cache Quantization to INT8", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can INT8 KV-cache quantization double concurrency from 64 to 128 at 4K context, and what quality impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3218", "title": "Prefix Caching for Shared System Prompts", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much KV-cache memory and TTFT can prefix caching save when 80% of requests share the 2048-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3219", "title": "KV Cache Eviction Under Memory Pressure", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What eviction and admission policy should you use to handle 80 requests requiring up to 204 GB of KV-cache with only a 35 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3220", "title": "KV-Cache Memory as Throughput Bottleneck", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the Llama 3 8B H100 instance limited to 40% compute utilization despite 200 queued requests and nearly full memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3221", "title": "Continuous Batching Interaction with KV-Cache", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does moving from static to continuous batching change KV-cache allocation, eviction, and scheduling requirements?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3222", "title": "KV-Cache Disaggregation for Prefill-Decode Split", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How large is the Llama 3 70B KV-cache for an 8K prefill, and how should the prefill pool hand it off to the decode pool?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 3}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3223", "title": "GQA vs MQA vs MHA KV-Cache Tradeoff", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which attention variants are memory-feasible for batch=16 at 128K context on one GPU, and what changes if none fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3224", "title": "KV-Cache Impact on Decode Memory Bandwidth", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At position 4096, what per-token HBM traffic comes from KV-cache reads versus resident 70B weight reads, and which dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3225", "title": "KV-Cache Memory Fragmentation in Long-Running Services", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does vLLM reject new requests despite 30 GB free GPU memory and only 22 GB of 35 GB allocated KV-cache actively used?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3226", "title": "KV-Cache Compression via Sliding Window + Sink Tokens", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should you manage KV-cache for 20K-token conversations on Mistral 7B while respecting its 4096-token sliding window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3227", "title": "KV-Cache Aware Autoscaling Policy", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What autoscaling metric should replace GPU utilization when requests are rejected at 45% GPU utilization during a traffic spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3228", "title": "KV-Cache Memory Planning for 
Multi-Turn Chat", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does KV-cache grow over 6 turns for Llama 3 8B, and how many concurrent conversations fit on one A100-80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3229", "title": "KV-Cache Recomputation vs Storage Tradeoff", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For Llama 3 70B on 2×H100, which is more economical: storing the KV-cache or recomputing it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3230", "title": "KV-Cache and Speculative Decoding Interaction", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should speculative decoding manage draft and verifier KV-caches when 5 of 8 candidate tokens are accepted?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3231", "title": "KV-Cache Scaling with Context Length Doubling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does doubling context length affect KV-cache memory, decode latency, and maximum batch size for the 96-layer GQA model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3232", "title": "KV-Cache Sharing for Parallel Sampling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can best-of-16 sampling share KV-cache for the common 2048-token prompt, and how much memory does it save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3233", "title": "Disaggregated KV-Cache Storage Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a 100 GB/s CXL-backed distributed KV-cache for Llama 3 70B, and when does it outperform local memory KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3234", "title": "KV-Cache Aware Request Scheduling", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the load balancer be modified to prevent replica 1 from rejecting requests while replicas 2-4 remain underutilized?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3235", "title": "KV-Cache Growth in RAG Pipelines", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does the RAG context growth to 16,128 tokens scale KV-cache memory, and what does that imply for serving cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3236", "title": "KV-Cache Pool Sizing for Throughput Optimization", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How much memory per GPU should be allocated to KV-cache, and what request concurrency does it support?", "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 2}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3237", "title": "KV-Cache Deduplication Across Requests", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What fraction of total traffic can reuse KV-cache entries under content-aware deduplication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3238", "title": "KV-Cache Memory vs Compute Roofline During Decode", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache data must be streamed per layer for one Llama 3 8B decode token at sequence position 2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3239", "title": "Managing Multi-Modal KV-Cache Memory for Mixed Image and Text Batches", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you manage KV-cache memory for 8 mixed text-only and image+text requests with 1600 visual tokens and 512 text tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3240", "title": "Diagnosing Workload Bottlenecks on NVIDIA H100 with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use a roofline model on the H100 to diagnose the 50 TFLOPS FP16 workload, and what are examples of memory-bound versus compute-bound operations?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 3}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3241", "title": "Optimizing Large Language Model Inference on AMD MI300X with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, is the 70B FP16 LLM on MI300X compute-bound or memory-bound, and how would you improve utilization?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 5}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3242", "title": "Optimizing Large Transformer Inference on NVIDIA H100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you structure H100 CUDA kernels to maximize occupancy, coalesced memory access, and FP16 Tensor Core utilization for transformer inference?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3243", "title": "Diagnosing Underperforming LLM Inference on AMD MI300X", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What MI300X architectural factors could explain 1 TFLOPS at batch size 1 despite high utilization and only 1 TB/s HBM bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3244", "title": "Optimizing Large Language Model Inference on NVIDIA H100 for High-Throughput", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile and optimize the 60 GB FP16 LLM on one H100 for occupancy, Tensor Cores, coalesced memory access, and latency-throughput trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3245", "title": "NVIDIA H100 vs. 
CPU: Optimizing Large Language Model Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is an NVIDIA H100 generally better than a multi-core CPU for LLM inference, and when might a CPU still be preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3246", "title": "LLM Inference Scaling on Google TPU v5e", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a single TPU v5e unsuitable for a 70B BF16 LLM, and how would you shard the model across accelerators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3247", "title": "AMD MI300X Accelerator Selection for Large Language Model Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an MI300X-based LLM inference architecture and justify it against GPUs, TPUs, and custom ASICs?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3248", "title": "Optimizing Large Language Model Inference on a Single NVIDIA A100", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What redesign is required when a 175B FP16 LLM cannot fit on one 80 GB A100 for low-latency serving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3249", "title": "Real-time LLM Inference: Accelerator Selection for Performance and Cost", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What serving architecture and accelerator choice would meet 100 ms latency and 10,000 QPS for the BF16 transformer on Google Cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3250", "title": "Accelerator Selection for Large Language Model Inference on AMD MI300X", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How suitable is the MI300X for batch-1 FP16 inference of a 70B LLM compared with CPUs or previous-generation GPUs?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3251", "title": "NVIDIA A100 vs. High-End CPU for LLM Training", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare a GPU with dual CPUs for FP16 LLM training in performance, cost-efficiency, and programmability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3252", "title": "Optimizing Large Language Model Inference on AMD MI300X", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you serve a 100B FP16 LLM on MI300X versus a custom ASIC or CPUs, and optimize data flow despite the 192 GB HBM limit?", "chain_ids": ["cloud-chain-auto-secondary-016-01"], "chain_positions": {"cloud-chain-auto-secondary-016-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3253", "title": "Optimizing Large Language Model Inference on Google TPU v5e with Systolic Arrays", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you tile the 20B BF16 transformer's GEMMs on TPU v5e, and when would you choose weight-stationary versus output-stationary dataflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3254", "title": "Optimizing Dense GEMM on AMD MI300X with Weight-Stationary Dataflow", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would a weight-stationary dataflow work for the 65536x512 by 512x65536 FP16 GEMM on MI300X, and is it compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3255", "title": "Mapping Llama 3 70B FFN to a Systolic Array", "topic": "systolic-dataflow", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you map a Llama 3 70B FFN matrix multiply onto a systolic array, including dataflow choice, tiling, and structured sparsity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3256", "title": "AMD MI300X HBM/Compute Bottleneck and Cost Analysis for LLM Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 100 TFLOP, 100 GB-per-inference MI300X service compute-bound or memory-bound, and what are the max inferences/sec and cost/inference?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3257", "title": "A100 LLM Cost Analysis: Training & Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate A100 GPU-hours and cost for 10^24 FLOP training plus 100 GPU-hours of inference, and what bottlenecks affect accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3258", "title": "Compute Cost Estimation for a Large Language Model on NVIDIA H100s", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate GPU-hours and cost to train a 70B LLM on 1T tokens at 60% utilization and $3.50 per GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3259", "title": "LLM Training Cost Estimation on Google TPU v5e", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate TPU v5e device-hours and dollar cost to train a 70B model on 1T tokens at 40% efficiency, and what else matters beyond FLOPs?", "chain_ids": ["cloud-chain-auto-secondary-015-08"], "chain_positions": {"cloud-chain-auto-secondary-015-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3260", "title": "H100 Training Cost for Large Language Model", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What GPU-hours, instance cost, electricity cost, and total cost do you estimate for the 100B-parameter training run?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3261", "title": "Optimizing Cost-Performance on Google TPU v5e for LLM Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate TPU v5e device count, device-hours, and cost to serve a 70B BF16 model at 200 ms P99 and 1000 QPS?", "chain_ids": ["cloud-chain-auto-secondary-015-08"], "chain_positions": {"cloud-chain-auto-secondary-015-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3262", "title": "LLM Fine-tuning Cost Estimation on AMD MI300X", "topic": "compute-cost-estimation", "competency_area": "compute", "track": 
"cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate MI300X GPU-hours and cost to fine-tune a 70B FP16 LLM on 1T tokens, and how could memory bandwidth change the estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3263", "title": "LLM Training Cost Estimation on NVIDIA H100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate total FLOPs, GPU-hours, and dollar cost to train a 70B model on 2T tokens, and how would you refine it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3264", "title": "VRAM Budgeting for 70B LLM on AMD MI300X", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you account for VRAM used by FP16 weights, AdamW state, activations, and KV-cache for 70B training and batch-8 inference?", "chain_ids": ["cloud-chain-auto-008-19"], "chain_positions": {"cloud-chain-auto-008-19": 0}, "chain_tiers": {"cloud-chain-auto-008-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3265", "title": "KV-Cache Optimization on AMD MI300X for Varying Context Lengths", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache management change would you make to avoid OOMs and erratic latency with frequent 64k-token requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3266", "title": "Optimizing LLM KV-Cache on NVIDIA A100 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache architecture would you use on A100s to handle 32k-64k contexts without OOMs while maximizing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3267", "title": "NVIDIA H100 KV-Cache Capacity Planning", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "With 80 GB of HBM used only for BF16 KV-cache, how many total tokens can be stored for 96 heads of dimension 128?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3268", "title": "Diagnosing KV-Cache Eviction on Google TPU v5e", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you 
diagnose this issue on the TPU v5e, pinpointing the root cause, and what initial steps would you take to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3269", "title": "Optimizing KV-Cache on AMD MI300X for High-Throughput LLM Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design KV-cache paging and eviction on MI300X to maximize concurrent 128k-token contexts with 64 KB of KV per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3270", "title": "A100 KV-Cache Management for Long Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where is the memory pressure for 32k-context LLM inference on an 80 GB A100, and how would you prevent OOMs while maximizing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3271", "title": "Optimizing KV-Cache on Google TPU v5e for Long Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you size, page, and evict the KV-cache for 128k-context 7B inference on a 16 GB TPU v5e to reduce pressure and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3272", "title": "Optimizing KV-Cache Management on NVIDIA A100 for Long-Context LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design and size a paged KV-cache system on 80 GB A100s for long-context serving with dynamic eviction and low tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3273", "title": "NVIDIA A100 Memory Hierarchy Bottlenecks for Large Models", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is a 150B FP16 LLM problematic on a single 80 GB A100 despite a 50 GB active working set, and what memory-hierarchy trade-offs result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3274", "title": "Analyzing Compute-Memory Tradeoffs with Gradient Checkpointing on NVIDIA H100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does gradient checkpointing reduce observed TFLOPS, and how would you optimize the compute-memory trade-off?", "chain_ids": 
["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 1}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3275", "title": "TPU v5e Activation Checkpointing Strategy for Large Language Models", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you incorporate activation checkpointing for a 100B-parameter MoE on TPU v5e when activations exceed 16 GB HBM per chip?", "chain_ids": ["cloud-chain-auto-027-25"], "chain_positions": {"cloud-chain-auto-027-25": 0}, "chain_tiers": {"cloud-chain-auto-027-25": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3276", "title": "Optimizing LLM Training with Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much memory does checkpointing save for the 250 GB activation block, and what extra latency does recomputation add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3277", "title": "Optimizing Large Model Training on NVIDIA A100 with Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you apply gradient checkpointing to train the 50B Transformer on one 80 GB A100, and what compute-memory trade-off would you expect?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 2}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3278", "title": "H100 Memory Optimization for Large Language Models: Checkpointing vs. 
Parallelism", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose between aggressive checkpointing on one accelerator and pipeline parallelism across accelerators for a 1T-parameter MoE model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3279", "title": "TPU v5e Activation Memory & Gradient Checkpointing for LLMs", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you apply gradient checkpointing on a TPU to fit the LLM within 16 GB HBM while preserving training throughput?", "chain_ids": ["cloud-chain-auto-027-25"], "chain_positions": {"cloud-chain-auto-027-25": 1}, "chain_tiers": {"cloud-chain-auto-027-25": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3280", "title": "Optimizing Large Model Training on AMD MI300X with Activation Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose activation-memory OOMs and choose a selective checkpointing strategy that quantifies memory saved versus recompute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3281", "title": "Optimizing Large Model Training with Gradient Checkpointing on A100", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would gradient checkpointing let this billions-parameter Transformer train on an 80 GB A100, and how would you tune the compute-memory tradeoff?", "chain_ids": ["cloud-chain-auto-027-23"], "chain_positions": {"cloud-chain-auto-027-23": 3}, "chain_tiers": {"cloud-chain-auto-027-23": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3282", "title": "TPU v5e Data Transfer Bottleneck Analysis", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are host-to-TPU activation transfers dominating throughput on TPU v5e, and how would you optimize data movement to reduce that bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3283", "title": "Optimizing Large Embedding Data Movement on AMD MI300X", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you 
design host-device data movement to minimize latency for embeddings and KV cache transfers given PCIe Gen5 bottlenecks?", "chain_ids": ["cloud-chain-auto-secondary-017-18"], "chain_positions": {"cloud-chain-auto-secondary-017-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3284", "title": "Optimizing Data Movement for LLM Inference on Google TPU v5e", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the TPU v5e data movement strategy for real-time LLM inference when weights, activations, and datasets exceed 16 GB HBM?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3285", "title": "Optimizing Data Movement for LLM Inference: A100 vs. V100 Evaluation", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do host-device data movement, DMA, zero-copy, and pinned memory affect latency and throughput in the two architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3286", "title": "Optimizing Data Movement for Large Models on NVIDIA H100", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you minimize host-device transfer overhead on an H100 when fine-tuning a 100B model with large optimizer states or embeddings offloaded to CPU memory?", "chain_ids": ["cloud-chain-auto-secondary-017-18"], "chain_positions": {"cloud-chain-auto-secondary-017-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3287", "title": "TPU v5e Memory Management: Diagnosing OOMs in Large Model Training", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do OOMs still occur on 16 GB HBM despite optimizer offload, and how would you reduce the peak memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3288", "title": "Designing a Memory-Efficient LLM Inference System on AMD MI300X", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage memory for a 130B LLM on a 192 GB GPU to avoid OOMs and page thrashing while keeping inference latency low?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3289", "title": "Optimizing LLM Deployment on A100: Mitigating Memory Fragmentation and OOM", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate A100 memory fragmentation and OS-level eviction under peak load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3290", "title": "LLM Memory Pressure Management on NVIDIA H100 Cluster", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a distributed training strategy to prevent OOM errors, minimize fragmentation, and optimize gradient accumulation for a 175B LLM on H100s?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3291", "title": "LLM Training on AMD MI300X: Memory Pressure Management", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate OOMs on MI300X training runs, including fragmentation, gradient accumulation, and OS-level eviction effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3292", "title": "Optimizing 70B LLM Training on NVIDIA A100 Under Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and resolve these memory pressure issues to ensure stable and efficient training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3293", "title": "LLM Fine-tuning OOM on NVIDIA H100: Diagnosing and Mitigating Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate OOMs for a 100B distributed LLM, including fragmentation, gradient accumulation, offload, and OS eviction effects?", "chain_ids": ["cloud-chain-auto-secondary-014-28"], "chain_positions": {"cloud-chain-auto-secondary-014-28": 3}, "chain_tiers": {"cloud-chain-auto-secondary-014-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3294", "title": "LLM Inference Latency Decomposition on NVIDIA A100", "topic": "latency-decomposition", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "How would you reduce the 550 ms end-to-end latency across network, CPU, GPU, and postprocessing, and what are the specific bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3295", "title": "Optimizing Inference Latency on AMD MI300X with Dynamic Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign the batching strategy to balance p99 latency below 50ms with maximum throughput?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3296", "title": "Low-Latency LLM Inference Scheduling on GPUs", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the system, including scheduling policies, to leverage the hardware's capabilities effectively while meeting both latency and throughput KPIs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3297", "title": "Optimizing Real-time LLM Inference on NVIDIA H100s with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design batching and scheduling to minimize latency while sustaining 1000 QPS P99 throughput under variable arrivals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3298", "title": "Optimizing Batching Strategies for LLM Inference on Google TPU v5e", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you choose among static, dynamic, and continuous batching on TPU v5e to maximize throughput while meeting a 100 ms P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3299", "title": "Optimizing Real-time LLM Inference Latency and Throughput on NVIDIA A100 with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design adaptive batching for LLM inference to achieve sub-50 ms p99 latency while maximizing throughput under variable traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3300", "title": "Understanding Tail Latency in AMD MI300X Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What causes P999 stragglers on the MI300X despite 5.3 TB/s HBM3 bandwidth, and what 
first mitigations would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3301", "title": "NVIDIA H100 LLM Inference: Achieving P99 Latency SLAs", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you leverage techniques like straggler detection, hedged requests, and SLA-driven resource management to ensure this critical P99 target is always met?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3302", "title": "Optimizing Tail Latency for Real-time Inference on Google TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reduce P999 latency below 100 ms on TPU v5e when most inferences take 10 ms but a small fraction become stragglers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3303", "title": "Designing a Low-Latency Inference Service on NVIDIA A100 with Strict SLAs", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the inference service to consistently meet the <100ms P99 and <250ms P999 tail latency SLA given variable traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3304", "title": "H100 Inference Service P99 Latency with Stragglers", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design your system to effectively monitor, mitigate, and predictably meet the P99 latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3305", "title": "Optimizing Tail Latency for Real-time ML Inference on Google TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What multi-TPU v5e architecture reduces stragglers to meet a 100ms P99 SLA, and how does it compare mathematically to a single TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3306", "title": "H100 LLM Training Latency Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the initial steps to diagnose this latency issue, and what specific H100 hardware characteristics should be considered during the analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-3307", "title": "Optimizing Low-Latency LLM Inference on AMD MI300X: Profiling and Bottleneck Identification", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile an MI300X LLM inference service from system metrics down to GPU kernels to find the root causes of inconsistent latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3308", "title": "Optimizing LLM Inference Latency on NVIDIA A100 with Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use profiling and tracing tools to systematically determine whether high inference latency is compute-, memory-, or I/O-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3309", "title": "NVIDIA H100 Latency Diagnosis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you pinpoint the bottleneck causing 150 ms H100 inference latency when batch size is tuned, CPU is idle, and nvidia-smi shows 90% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3310", "title": "TPU v5e Latency Bottleneck Analysis for Large Language Models", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile TPU v5e inference to determine whether peak-time tail latency is compute-bound, memory-bound, or I/O-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3311", "title": "Optimizing Large Language Model Inference Latency on AMD MI300X", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile MI300X LLM inference to reduce 500 ms average latency toward 100 ms across compute, memory, and I/O bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3312", "title": "NVIDIA H100 Inference Queue Depth and Little's Law", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Little's Law to choose queue depth and capacity for a single accelerator while keeping average inference latency at 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3313", "title": "Optimizing Inference Latency on AMD MI300X with 
Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Little’s Law and M/G/1 queueing to set arrival-rate limits and queue depth for 25 ms MI300X inference under a 50 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3314", "title": "Diagnosing H100 Inference Latency Spikes with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use queueing theory to diagnose P99 spikes above 1000 ms when average utilization is 60-70% but scheduler queue depth is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3315", "title": "Optimizing Real-Time ML Inference Latency on Google TPU v5e with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many TPU v5e devices and what queueing strategy would you use to serve 10,000 QPS with 50 ms average latency and 150 ms P99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3316", "title": "LLM Inference Queue Management on AMD MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you set per-GPU queue depth and capacity plans for 200 ms service time while keeping average end-to-end latency at 500 ms under spikes?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 0}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3317", "title": "Optimizing LLM Inference Latency with Queueing Theory on NVIDIA A100s", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you apply queueing theory to analyze performance, determine optimal queue depth, and plan capacity to meet the 200ms P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3318", "title": "Optimizing LLM Inference Latency on AMD MI300X with Queueing Theory", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you apply queueing theory principles to determine optimal queue depths, manage arrival rates, and plan capacity to minimize latency and prevent resource starvation or underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "cloud-3319", "title": "Weight Memory Sizing with Mixed-Precision Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Under weight-only assumptions, how many of 10B parameters must be quantized from BF16 to INT8 to fit within 16 GB of HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3320", "title": "Optimizing LLM Deployment with INT8 Quantization on AMD MI300X", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What technical steps and tradeoffs would you consider to convert the 100 GB FP16 LLM to INT8 on an MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3321", "title": "Quantization Strategy for LLM Deployment on NVIDIA A100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and justify an INT8/INT4 quantization strategy on an 80GB GPU, including PTQ vs QAT, granularity, zero-points, and expected gains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3322", "title": "H100 Mixed-Precision Performance Considerations", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you implement mixed-precision training on the GPU and verify it benefits from Tensor Cores and HBM bandwidth?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3323", "title": "Analyzing Mixed-Precision Performance on Google TPU v5e", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would migrating this FP32 LLM to BF16 mixed precision on TPU v5e affect memory, throughput, stability, and design choices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3324", "title": "Optimizing Large Language Model Training with Mixed Precision on AMD MI300X", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision training strategy would you use on MI300X for this FP32 LLM, and how would you manage stability and accuracy tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-015-04"], "chain_positions": 
{"cloud-chain-auto-secondary-015-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3325", "title": "Optimizing Large Model Training with Mixed Precision on NVIDIA A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For a 175B LLM, what are FP32/FP16 parameter memory, A100 FP16 theoretical speedup, and why is loss scaling critical?", "chain_ids": ["cloud-chain-auto-secondary-015-05"], "chain_positions": {"cloud-chain-auto-secondary-015-05": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3326", "title": "H100 Mixed-Precision Training Instability and Performance Diagnosis", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of these stability and performance issues, considering the specific hardware capabilities?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3327", "title": "Fitting LLM Training on TPU v5e with BF16 Sharding", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine BF16, sharding, checkpointing, and offload to train an LLM on TPU v5e while preserving FP32-like accuracy?", "chain_ids": ["cloud-chain-auto-secondary-015-03"], "chain_positions": {"cloud-chain-auto-secondary-015-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3328", "title": "Optimizing Large Language Model Training with Mixed Precision on NVIDIA A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare FP16 AMP with BF16 mixed precision, and which would you initially pursue for this LLM?", "chain_ids": ["cloud-chain-auto-secondary-015-05"], "chain_positions": {"cloud-chain-auto-secondary-015-05": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3329", "title": "Optimizing Large Model Training with Mixed-Precision on Google TPU v5e", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": 
"How would you diagnose OOM for a 70B FP32 LLM on one TPU v5e and decide whether mixed precision can fix it?", "chain_ids": ["cloud-chain-auto-secondary-015-03"], "chain_positions": {"cloud-chain-auto-secondary-015-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3330", "title": "Optimizing LLM Training with Mixed-Precision on AMD MI300X", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design mixed-precision training for a 175B LLM on MI300X, including dtype choices, loss scaling, stability, and accelerator sizing?", "chain_ids": ["cloud-chain-auto-secondary-015-04"], "chain_positions": {"cloud-chain-auto-secondary-015-04": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3331", "title": "NVIDIA A100 LLM Inference Throughput Under Power Constraints", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose this issue and explain the behavior despite high utilization, considering power management concepts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3332", "title": "Power-Efficient LLM Inference on Google TPU v5e", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use TPU v5e power caps, DVFS P-states, and the CMOS power equation to minimize energy per token while meeting a 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3333", "title": "Energy-Efficient LLM Inference on AMD MI300X Cluster", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize tokens per Joule while meeting throughput and latency targets using TDP, power caps, DVFS, and P≈CV²f?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3334", "title": "Google TPU v5e Thermal Limits and Sustained Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What thermal mechanisms limit TPU v5e burst vs sustained BF16 performance, and how would higher ambient temperature affect them?", "chain_ids": ["cloud-chain-auto-secondary-015-44"], "chain_positions": {"cloud-chain-auto-secondary-015-44": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3335", "title": "AMD MI300X Thermal Throttling Analysis for LLM Inference", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does MI300X inference drop to about 60% of peak after sustained load, and what telemetry would indicate thermal throttling?", "chain_ids": ["cloud-chain-auto-secondary-015-45"], "chain_positions": {"cloud-chain-auto-secondary-015-45": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3336", "title": "Designing for Sustained Performance on NVIDIA A100: Thermal Management for Large-Scale AI", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design thermal management for an A100 inference cluster at up to 30°C ambient to sustain high FP16 throughput without throttling?", "chain_ids": ["cloud-chain-auto-secondary-015-46"], "chain_positions": {"cloud-chain-auto-secondary-015-46": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3337", "title": "Diagnosing Performance Variability on Google TPU v5e due to Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose whether TPU training slowdowns after an hour are caused by thermal throttling and ambient temperature effects?", "chain_ids": ["cloud-chain-auto-secondary-015-44"], "chain_positions": {"cloud-chain-auto-secondary-015-44": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3338", "title": "Cloud LLM Thermal Design with AMD MI300X", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design cooling and throttling mitigation for a two-MI300X server node to maximize sustained training performance and meet SLAs?", "chain_ids": ["cloud-chain-auto-secondary-015-45"], "chain_positions": {"cloud-chain-auto-secondary-015-45": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3339", "title": "NVIDIA A100 Thermal Throttling in Cloud Inference at Scale", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal-aware orchestration, power, and cooling strategy would you use to prevent A100 throttling under 35°C rack hotspots?", "chain_ids": ["cloud-chain-auto-secondary-015-46"], "chain_positions": {"cloud-chain-auto-secondary-015-46": 1}, 
"chain_tiers": {"cloud-chain-auto-secondary-015-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3340", "title": "Optimizing LLM Inference Energy on AMD MI300X", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might MI300X LLM inference have low tokens/s and high energy cost despite massive TFLOPS, and how do energy-aware operators help?", "chain_ids": ["cloud-chain-auto-secondary-009-18"], "chain_positions": {"cloud-chain-auto-secondary-009-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3341", "title": "Energy-Aware MoE Inference on AMD MI300X", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an MI300X MoE inference pipeline to maximize throughput within a power envelope using energy-aware operator selection?", "chain_ids": ["cloud-chain-auto-secondary-009-18"], "chain_positions": {"cloud-chain-auto-secondary-009-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3342", "title": "Optimizing Energy Efficiency for LLM Inference on NVIDIA A100", "topic": "energy-per-operation", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce energy per query for an A100 MoE LLM whose expert weights are frequently loaded from HBM2e under a latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3343", "title": "NVIDIA A100 Cluster Power and Cooling Optimization", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize power, cooling, and PUE for 100 GPUs, and what carbon-aware scheduling strategies apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3344", "title": "Datacenter Architecture for H100-based AI Cluster with Carbon Efficiency Goals", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design power, cooling, and carbon-aware scheduling for an H100 training cluster targeting PUE 1.1 over a 5-year lifecycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3345", "title": "Diagnosing Unexpected Power Overages in an AMD MI300X Cluster", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What could be the root cause of these symptoms, and how would you methodically diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3346", "title": "Datacenter Efficiency: A100 Rack Design for Carbon-Aware ML in Cloud", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fit 16 A100s into a 10 kW rack budget with cooling, layout, carbon-aware scheduling, and 5-year carbon tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3347", "title": "NVIDIA H100 Rack Power and PUE Analysis", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calculate whether 16 GPUs fit in a 15 kW rack and estimate grid power at PUE 1.6?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 1}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3348", "title": "Optimizing a Large-Scale A100 Cluster for Carbon-Aware Operations", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the bottlenecks and quantify the potential benefits of these changes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3349", "title": "Optimizing H100-powered AI Clusters for Carbon-Aware Datacenter Efficiency", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you size rack power and liquid cooling, then estimate operational carbon for an H100 datacenter at PUE 1.15?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3350", "title": "Optimizing Large Transformer Inference on Google TPU v5e: Attention and KV-Cache Scaling", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do KV-cache growth and O(N²) attention bottleneck a 70B Transformer on TPU v5e, and what attention or quantization changes would mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3351", "title": "Optimizing KV-Cache Memory on AMD MI300X for Large Language Models", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": 
"both", "question": "What formula and maximum sequence length apply at batch size 8 before KV-cache OOM when 140 GB of the 192 GB HBM holds weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3352", "title": "NVIDIA A100 Performance Bottleneck in Large Context LLM Inference with Multi-Head Attention", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does standard MHA cause memory and latency bottlenecks for 64k contexts on an A100, and what attention optimizations would you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3353", "title": "Scaling LLM Inference with Attention Variants", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What attention mechanism and context-scaling strategy would you use for 128k-token LLM inference, and how would you justify it quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3354", "title": "Scaling Attention for Long Contexts", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which attention modification would you choose for 128K-token inference, and how does it address memory and latency bottlenecks?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3355", "title": "Optimizing LLM Context with Attention Scaling on NVIDIA H100", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate MHA versus MQA or GQA for 128k-token LLM inference on H100, including memory, bandwidth, latency, and quality tradeoffs?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 3}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3356", "title": "Optimizing LLM Attention Scaling on AMD MI300X for Long Context", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose KV-cache bandwidth bottlenecks in MHA and quantify the benefit of switching to GQA, MQA, or sliding-window attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3357", "title": "Scaling 
Long-Context Attention on NVIDIA A100 for LLM Inference", "topic": "attention-scaling", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the system to handle the memory and compute requirements, considering techniques like grouped-query attention or sliding window attention?", "chain_ids": ["cloud-chain-auto-secondary-013-04"], "chain_positions": {"cloud-chain-auto-secondary-013-04": 4}, "chain_tiers": {"cloud-chain-auto-secondary-013-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3358", "title": "MoE Inference Optimization on NVIDIA A100", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory capacity and bandwidth bottlenecks arise for this single-GPU MoE design?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 0}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3359", "title": "Optimizing MoE Routing on NVIDIA H100 for Latency-Sensitive Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are GPU MoE throughput and latency scaling poorly, and how would you mitigate routing, capacity, and memory-bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3360", "title": "Scaling a 1 Trillion Parameter Mixture of Experts Model on NVIDIA A100s", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and size a cost-minimal MoE inference system for 10,000 QPS at 50 ms token latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3361", "title": "MoE Routing and H100 Capacity Planning for Large-Scale Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce the MoE service's tail latency from expert load imbalance and HBM3 memory contention on H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3362", "title": "Optimizing MoE Routing on NVIDIA A100 for Extreme Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks explain the 50 vs. 
200 tokens/sec MoE throughput on A100s, and how would you quantify and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3363", "title": "Estimating a Large Language Model's Fit on Google TPU v5e", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Without considering quantization or optimizations, what is the initial memory challenge when deploying this model on a single TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3364", "title": "LLM Deployment Feasibility on AMD MI300X", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a 70B FP16 LLM fit and run in real time on one MI300X, and what memory or bandwidth bottlenecks would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3365", "title": "NVIDIA H100 Model Deployment Feasibility for Large Language Model", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a 175B FP16 LLM fit on one H100, what is the memory-bound token latency, and how many H100s are minimally required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3366", "title": "Estimating LLM Deployment on Google TPU v5e", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a single TPU v5e hold this 70B model plus activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3367", "title": "TPU v5e Split Architecture for Generation and Classification", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture would you recommend for optimizing performance and cost on the TPU v5e, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3368", "title": "Encoder-Decoder Tradeoffs on AMD MI300X for Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the inference compute, memory, and bandwidth costs of encoder-only, decoder-only, and encoder-decoder models on the MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3369", "title": "A100 Inference Optimization: Encoder-Decoder 
Tradeoffs for Real-time LLM Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the A100 tail latency and cost issues, and what architecture would you use for summarization plus response generation?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3370", "title": "H100-Optimized LLM: Encoder vs. Decoder Architecture Tradeoffs for Cost-Effective Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate encoder-only, decoder-only, and encoder-decoder options on H100s against 50 ms latency, throughput, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3371", "title": "A100 Inference Tradeoffs for Encoder-Only, Decoder-Only, and Encoder-Decoder Models", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the system costs and architectural tradeoffs, specifically leveraging the A100's capabilities, to make a recommendation for a low-latency, high-throughput service?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3372", "title": "H100 Deployment Strategy: Encoder-Decoder Tradeoffs for LLM Cost Optimization", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture (encoder-only, decoder-only, or encoder-decoder) would you choose for query understanding and generation, and how would you quantify bottlenecks?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3373", "title": "Encoder-Decoder Architecture Tradeoffs on Google TPU v5e for Real-time Inference", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture is best for low-latency conversational AI on TPU v5e, and how do latency, throughput, HBM, and cost trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3374", "title": "H100 Performance Bottlenecks with Structured vs. 
Unstructured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does high pruning sparsity fail to speed up inference on H100, and how do structured and unstructured pruning differ?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 1}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3375", "title": "Optimizing LLM Deployment with Structured Sparsity on Google TPU v5e", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you change from unstructured to structured pruning on TPU v5e to reach 2x speedup, and what utilization gains would you target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3376", "title": "Optimizing Large Language Model Deployment on AMD MI300X with Structured Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach structured versus unstructured pruning, and what sparsity patterns would best exploit sparse compute hardware?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 3}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3377", "title": "Optimizing LLM Deployment on NVIDIA A100: Structured vs. 
Unstructured Pruning for Latency and Throughput", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which pruning approach and sparsity patterns would you use to keep accuracy within 1% while improving latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3378", "title": "70B-to-7B Distillation HBM and Throughput Tradeoffs", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill a 70B teacher into a 7B student while balancing logit or feature distillation, HBM use, and throughput?", "chain_ids": ["cloud-chain-auto-secondary-015-32"], "chain_positions": {"cloud-chain-auto-secondary-015-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3379", "title": "Optimizing Real-time Inference with Knowledge Distillation on NVIDIA A100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill the image segmentation teacher into an A100-friendly student for 10,000 RPS at 50 ms p99, and why choose distillation over pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3380", "title": "H100 Memory Optimization for Knowledge Distillation Logit Matching", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What maximum batch size fits for logit matching with 20 GB free HBM3, sequence length 2048, vocab 50,000, and what logit precision would you use?", "chain_ids": ["cloud-chain-auto-secondary-015-30"], "chain_positions": {"cloud-chain-auto-secondary-015-30": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3381", "title": "Diagnosing ResNet-50 Student Inference Latency on TPU v5e", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose why the ResNet-50 student is 20% slower than expected on TPU v5e at batch size 64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3382", "title": "Optimizing Large Language Models with Knowledge Distillation on AMD MI300X", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose between logit matching and feature distillation, 
and when is distillation preferable to pruning on MI300X for sub-50ms latency?", "chain_ids": ["cloud-chain-auto-secondary-015-31"], "chain_positions": {"cloud-chain-auto-secondary-015-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3383", "title": "Optimizing Large Language Model Deployment on H100 via Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you size the student model and choose logit or feature distillation to balance near-teacher accuracy, latency, and cost?", "chain_ids": ["cloud-chain-auto-secondary-015-30"], "chain_positions": {"cloud-chain-auto-secondary-015-30": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3384", "title": "Optimizing Real-time LLM Inference on Google TPU v5e via Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the TPU v5e bottleneck for the 15B BF16 teacher, and how would you distill a student to hit 50 ms while retaining 98% accuracy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3385", "title": "LLM Distillation for High-Throughput Inference on AMD MI300X", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you serve the 7B distilled LLM on MI300X using quantization, sparse compute, and hardware-tailored distillation, and when would it beat pruning?", "chain_ids": ["cloud-chain-auto-secondary-015-31"], "chain_positions": {"cloud-chain-auto-secondary-015-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3386", "title": "H100 Memory Bottleneck in LLM Element-wise Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are the H100 element-wise operations memory-bound, and how would you fuse them to reduce global memory traffic and kernel launch overhead?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 3}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3387", "title": "Optimizing Transformer Inference on Google TPU v5e via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use operator fusion on TPU 
v5e to cut memory-bound element-wise latency, and how would you identify, implement, and validate it?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3388", "title": "Optimizing Memory-Bound Operations via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the three memory-bound element-wise operations to reduce memory traffic and kernel launch overhead?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 4}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3389", "title": "TPU Kernel Fusion for Memory-Bound Element-wise Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the ReLU and bias addition kernels, and what performance improvement would you expect?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3390", "title": "Optimizing LLM Inference on AMD MI300X via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify and fuse memory-bound element-wise operators into custom ROCm kernels on MI300X to maximize HBM3 bandwidth and cut latency?", "chain_ids": ["cloud-chain-auto-secondary-004-25"], "chain_positions": {"cloud-chain-auto-secondary-004-25": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3391", "title": "H100 LLM Inference Optimization: Kernel Fusion for Memory-Bound Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify, implement, and validate operator fusion for these memory-bound LLM ops to reduce kernel launches and global memory traffic?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 5}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3392", "title": "H100 Graph Optimization Strategies for LLMs", "topic": "graph-compilation", "competency_area": "optimization", "track": 
"cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do AOT compilation, operator lowering, and constant folding improve LLM inference on an H100?", "chain_ids": ["cloud-chain-auto-023-04"], "chain_positions": {"cloud-chain-auto-023-04": 1}, "chain_tiers": {"cloud-chain-auto-023-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3393", "title": "Custom Activation Host-Device Bottleneck", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might a custom activation underperform on this accelerator despite low HBM use, and how should operator lowering be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3394", "title": "MI300X Operator Fusion for LLM Memory Bandwidth Optimization", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If fusion cuts memory traffic by 30% for a strictly memory-bound operator, what theoretical execution-time reduction should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3395", "title": "Optimizing LLM Inference on NVIDIA A100 with Custom AOT Graph Compiler", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT graph compiler for an LLM to remove dynamic dispatch overhead and improve HBM utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3396", "title": "Optimizing Large Model Inference on H100 with AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT graph compiler for H100 LLM inference using operator lowering and constant folding while balancing FP16 compute and HBM3 bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3397", "title": "Optimizing 7B LLM Inference on Google TPU v5e via AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do compilation techniques help meet a 50 ms/token decode target despite memory-bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3398", "title": "Optimizing LLM Inference on AMD MI300X: Memory-Bound Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L4", 
"zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reschedule operators to improve memory reuse, parallel execution, and layer fusion for the 70B LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3399", "title": "NVIDIA A100 Operator Scheduling for LLM Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule operators and handle dynamic batching on GPUs to minimize latency and maximize throughput for LLM inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3400", "title": "Optimizing Transformer Operator Scheduling on NVIDIA A100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an operator schedule that reduces Transformer inference latency and peak HBM memory use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3401", "title": "H100 Transformer Inference Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare Strategies A and B on the GPU for latency, throughput, and memory use, and which hardware limits matter most?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3402", "title": "Optimizing LLM Inference Scheduling on Google TPU v5e", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule LLM operators to minimize execution time under the 16 GB HBM and 1.6 TB/s bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3403", "title": "Optimizing LLM Inference on AMD MI300X via Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator execution order for the 70B LLM to reduce memory pressure and improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3404", "title": "A100 Transformer Scheduling for Memory and Throughput", "topic": "operator-scheduling", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule operators to manage dependencies, reuse memory, fuse kernels, and reduce inference latency?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3405", "title": "Optimizing IO-Aware Attention on AMD MI300X for Large Sequence Models", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you adapt FlashAttention-style tiling and online softmax to a GPU for long-sequence LLM inference?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 1}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3406", "title": "H100 Memory Optimization for IO-Aware Attention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compute the H100 standard-attention sequence limit, derive FlashAttention's HBM footprint, and explain the bandwidth gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3407", "title": "Diagnosing IO-Bound FlashAttention on TPU v5e", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why 64x64 FlashAttention tiles are IO-bound on TPU v5e, and which metrics would guide you?", "chain_ids": ["cloud-chain-auto-014-08"], "chain_positions": {"cloud-chain-auto-014-08": 0}, "chain_tiers": {"cloud-chain-auto-014-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3408", "title": "Optimizing Large Language Model Attention on AMD MI300X with IO-Aware Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design FlashAttention-style tiled attention with online softmax on MI300X, and quantify the memory and throughput benefits?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 2}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3409", "title": "FlashAttention on A100: Optimizing for Sequence Length and Throughput", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign self-attention with FlashAttention-style tiling and online softmax to support long sequences on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3410", "title": "Designing IO-Aware Attention on NVIDIA A100: FlashAttention and Online Softmax for Large Language Models", "topic": "flash-attention", 
"competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use FlashAttention-style tiling and online softmax on the A100 for 16,384-token, 8,192-hidden LLM attention?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3411", "title": "Optimizing LLM Inference Latency with Speculative Decoding on NVIDIA H100", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does draft-verify speculative decoding use H100 bandwidth and FP16 compute, and how would you choose the draft model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3412", "title": "TPU v5e Speculative Decoding Performance Analysis", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate TPU v5e draft model choices for speculative decoding to minimize streaming generation latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3413", "title": "High-Throughput Speculative Decoding on AMD MI300X for Low-Latency LLM Inference", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect speculative decoding on MI300X to co-locate draft and verify models, select drafts dynamically, and maximize acceptance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3414", "title": "Optimizing Speculative Decoding on NVIDIA A100 for LLM Inference", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and improve A100 speculative decoding when acceptance sometimes falls below 40% and draft overhead is high?", "chain_ids": ["cloud-chain-auto-025-09"], "chain_positions": {"cloud-chain-auto-025-09": 1}, "chain_tiers": {"cloud-chain-auto-025-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3415", "title": "Optimizing Speculative Decoding Latency on NVIDIA A100 for LLMs", "topic": "speculative-decoding", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What bottleneck likely keeps 70B speculative decoding at 150 ms per token on A100, and what quantifiable fix would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3416", "title": "Analyzing Data Parallelism Bottlenecks on Google TPU v5e", "topic": "data-parallelism", "competency_area": 
"parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FSDP data-parallel ResNet-200 training on TPU v5e degrade beyond 64 devices, and how would you improve scaling?", "chain_ids": ["cloud-chain-auto-013-09"], "chain_positions": {"cloud-chain-auto-013-09": 2}, "chain_tiers": {"cloud-chain-auto-013-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3417", "title": "H100 Tensor Parallelism and Communication Overlap", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What communication challenges does column-partitioned tensor parallelism create on H100s, and how would you overlap communication with compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3418", "title": "Optimizing Large Language Model Inference with Tensor Parallelism on Google TPU v5e", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design TPU v5e tensor parallelism for a 175B LLM to overlap communication and computation and reduce latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3419", "title": "Scaling a 500B Parameter LLM with Hybrid Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine tensor and pipeline parallelism across GPUs for a 500B LLM to balance memory, communication, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3420", "title": "Diagnosing Tensor Parallelism Bottlenecks on NVIDIA GPUs for LLMs", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose tensor-parallel underperformance characterized by low compute utilization and heavy inter-GPU communication across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3421", "title": "TPU v5e Tensor Sharding for 100B LLM Inference", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you shard a 100B LLM across TPU v5e devices with tensor parallelism to minimize inference latency and communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3423", "title": "Optimizing Pipeline Parallelism on Google TPU v5e", "topic": "pipeline-parallelism", 
"competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you split the model across TPU pipeline stages and use micro-batching and interleaving to reduce bubble overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3424", "title": "Optimizing LLM Pipeline Parallelism on AMD MI300X with Micro-batching and Interleaving", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do pipeline bubbles arise with 8 stages at batch size 1, and how would micro-batching and interleaving improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3425", "title": "Optimizing Micro-Batch Size for a 4-Stage GPU Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What micro-batch size maximizes throughput for this 4-stage pipeline with batch 64 and a 16-sample memory cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3426", "title": "Diagnosing TPU Pipeline Stalls in Large Language Models", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of these unexpected stalls and massive pipeline bubbles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3427", "title": "Choosing Micro-Batch Count for an 8-Stage A100 Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What micro-batch count m brings the bubble fraction below 10% while keeping the 1F1B activation memory footprint inside the 80 GB HBM budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3428", "title": "Optimizing LLM Pipeline Parallelism on NVIDIA H100 Cluster", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you schedule an 8-stage H100 pipeline for a terabyte-scale MoE model to minimize bubbles and exploit compute and bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3429", "title": "Pipeline Bubble on a Heterogeneous TPU v5e Stage Layout", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For 8 TPU stages with m=4 and a 175 ms final stage, 
what bubble fraction and schedule-only change get within 10% of ideal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3431", "title": "3D Parallelism Sizing — Adam State + Activations on A100s", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do DP, TP, and PP affect per-GPU memory, and what minimum sharding product makes the 70B model fit?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 0}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3432", "title": "Optimizing Frontier Model Training with 3D Parallelism on NVIDIA H100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why might 3D-parallel training of a 1T-parameter model on H100s stall during pipeline communication despite high HBM bandwidth?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 2}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3433", "title": "Optimizing 3D Parallelism for Frontier LLM Training on AMD MI300X", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine data, tensor, and pipeline parallelism on the cluster for a 1T-parameter LLM with a 2 PB training footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3434", "title": "Optimizing Frontier Model Training with 3D Parallelism on NVIDIA A100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design 3D parallel training for a 500B FP16 model on 8-GPU A100 nodes while managing memory and communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3435", "title": "Designing 3D Parallelism for Frontier Models on NVIDIA H100", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you integrate data, tensor, and pipeline parallelism on H100s to train trillion-parameter models efficiently?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 3}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3437", "title": "Designing Efficient Gradient Synchronization for Large-Scale LLM Training 
on AMD MI300X", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a gradient synchronization strategy for a 1T-parameter LLM data parallelism to minimize network bottlenecks while preserving convergence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3439", "title": "Optimizing Gradient Synchronization for LLM Training on H100 Clusters", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which gradient synchronization strategy would you use for 175B-parameter training on 8 H100s in one NVLink server versus two 4-GPU servers over 200 Gb/s Ethernet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3442", "title": "NVIDIA H100 Multi-tenancy with MIG for Resource Sharing", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does MIG on a GPU enable multi-tenant GPU sharing, and what resource guarantees and isolation does each instance provide?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 0}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3443", "title": "Google TPU v5e: Multi-Tenant Scheduling and Resource Contention", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are the RL and LLM jobs interfering on shared TPU v5e devices, and how should the scheduler prevent preemption and throughput instability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3445", "title": "A100 MIG Instance Utilization and Concurrency for ML Inference", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For a 1g.10gb A100 MIG slice, how many 20 TFLOPS and 100 GB/s inference requests fit concurrently, and what is the total across seven slices?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3446", "title": "Multi-Tenant H100 GPU Scheduling for Cloud ML", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule and partition GPUs for mixed distributed training and latency-critical inference while ensuring utilization, fairness, and SLOs?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 3}, 
"chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3447", "title": "TPU v5e Multi-Tenant Scheduling and Preemption", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule the 2 GB, 20 TFLOPS inference job onto the occupied TPU v5e while minimizing disruption, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3449", "title": "Inference Tail-Latency vs Training Throughput Contention on TPU v5e", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you eliminate the 9 ms ICI AllReduce tail that blows the inference SLO on a shared TPU v5e pod, and what does it cost the training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3450", "title": "Optimizing Collective Communication on AMD MI300X", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "When would ring AllReduce be preferred over tree AllReduce for gradient sync, and how do message size and bandwidth affect that choice?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3451", "title": "NVIDIA Collective Communication: Ring vs. 
Tree AllReduce Latency", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 10 GB AllReduce over 8 GPUs with 600 GB/s bandwidth, what are the theoretical minimum communication times for bandwidth-optimal ring and tree algorithms, and why are they similar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3452", "title": "Distributed LLM Training on AMD MI300X: Interconnect Bottlenecks", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might a fat-tree interconnect outperform a torus or Dragonfly topology for MI300X LLM all-reduce, and what topology bottlenecks limit scalability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3453", "title": "Designing High-Performance Interconnects for Large-Scale A100 Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What interconnect architecture would you choose for 128 GPUs with communication-heavy training, and how would NVLink, NVSwitch, InfiniBand, and topology factor in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3454", "title": "Optimizing MoE LLM Training on H100s: Diagnosing and Fixing Interconnect Bottlenecks", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix the MoE expert-dispatch bottleneck on 64 GPUs, and quantify the expected throughput improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3455", "title": "Optimizing LLM Training on AMD MI300X Clusters: Mitigating Network Bandwidth Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate network bandwidth bottlenecks causing underutilized MI300X accelerators in multi-node LLM training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3456", "title": "Optimizing Distributed ML Training with Network Bandwidth Constraints on NVIDIA A100s", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the data flow and communication patterns to mitigate network bandwidth bottlenecks, analyze the communication-computation ratio, and model the bandwidth cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3457", "title": "Optimizing Distributed Training on NVIDIA H100: A Network Bandwidth Challenge", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate whether the 16 GB gradient bucket is network-bound and reduce the 400 Gbps bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3458", "title": "Optimizing Network-Bound LLM Training on AMD MI300X Cluster", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and address all-reduce bottlenecks for a 70B FP16 LLM with 100 GB/s effective inter-node bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3459", "title": "Optimizing Distributed Training under Network Bandwidth Bottlenecks on A100 Clusters", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically analyze, quantify, and mitigate the network bandwidth bottleneck for this MoE model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3460", "title": "Optimizing Distributed LLM Training on NVIDIA H100 Network Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you model bisection bandwidth, communication-computation ratio, and cost impact for 64 H100s stalled on AllReduce, and what optimizations would you take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3461", "title": "Optimizing Inter-GPU Communication for Distributed LLM Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What network and data-movement architecture would you use so multi-node LLM training avoids inter-GPU communication bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3462", "title": "Optimizing Distributed LLM Training with RDMA on NVIDIA A100", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a RoCE/RDMA network and communication strategy for a 100 GB all-reduce so GPUs stay compute-bound?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 2}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3463", "title": "Comparing High-Performance Interconnects for Distributed Training on Google TPU v5e", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do TCP/IP Ethernet and InfiniBand RDMA compare for TPU v5e all-reduce in latency, CPU overhead, memory copies, and training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3464", "title": "Optimizing Distributed Training with RDMA on AMD MI300X", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use RDMA, zero-copy networking, and kernel bypass to make gradient synchronization high-throughput and low-latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3465", "title": "Optimizing Distributed Training Throughput with RoCE on GPUs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reconfigure the RoCE v2 communication stack so the 64 GPUs use RDMA and GPUDirect for the 100 GB all-reduce instead of TCP/IP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3466", "title": "Optimizing Distributed Training Communication with RDMA on NVIDIA H100", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you leverage RDMA, InfiniBand verbs, zero-copy networking, and kernel bypass to maximize communication throughput for an all-reduce operation?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 3}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3467", "title": "TPU v5e Inference Load Balancing Basics", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What basic load balancing algorithms would you use to distribute variable inference requests across TPU v5e instances?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 0}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3468", "title": "Optimizing LLM Inference Routing on AMD MI300X Cluster", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How 
would you replace round-robin with dynamic, load-aware routing to reduce tail latency and balance utilization across the MI300X servers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3469", "title": "Designing a High-Throughput Inference Load Balancer for NVIDIA A100 Clusters", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing for this service considering consistent hashing, weighted round-robin, and dynamic traffic management to meet a strict P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3470", "title": "LLM Inference Load Balancing on TPU Fleet", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route and load-balance 50,000 QPS across TPU inference servers to hit 100 ms latency during updates and failures?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3471", "title": "LLM Inference Routing on AMD MI300X Cluster", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you load-balance and route 10,000 RPS across 500 instances while handling variable requests, updates, and failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3472", "title": "NVIDIA A100 Inference: Load Balancing for Distributed ML Systems", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate WRR versus locality-aware consistent hashing for 10 A100s across two data centers to handle traffic spikes and GPU failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3473", "title": "LLM Inference Scaling: H100 Load Balancing & Routing for P99 Latency", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which request routing policy would you use above the 500-GPU floor to meet the 100ms P99 latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3474", "title": "Optimizing Inference Routing for High-Throughput TPU v5e Farms", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"How would you replace round-robin with adaptive routing for TPU v5e inference to improve utilization and latency under fluctuating load?", "chain_ids": ["cloud-chain-auto-secondary-013-32"], "chain_positions": {"cloud-chain-auto-secondary-013-32": 1}, "chain_tiers": {"cloud-chain-auto-secondary-013-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3475", "title": "TPU v5e Network Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Google TPU v5e manage network congestion, and how does its approach differ from ECN, PFC, and DCQCN in RoCE GPU clusters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3476", "title": "Congestion Control in AMD MI300X GPU Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do incast congestion and weak flow control stall all-reduce jobs, and how do ECN, PFC, and DCQCN mitigate this in RoCEv2?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 1}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3477", "title": "Mitigating Incast Congestion in GPU Clusters for Distributed ML Training", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you mitigate severe incast congestion during all-reduce using ECN, PFC, DCQCN, and intelligent flow scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3478", "title": "H100 Cluster Congestion Control with DCQCN", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you apply DCQCN to reduce incast during 128-H100 all-reduce, and estimate per-H100 network throughput?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 0}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3479", "title": "Evaluating Congestion Control Strategies in an A100 GPU Cluster", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which proposal would you choose for the 512-GPU incast problem, and what are the trade-offs of DCQCN versus PFC/ECN with flow scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3480", "title": "H100 Cluster Congestion 
Control and Network Optimization for Distributed ML", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use ECN, PFC, DCQCN-style control, and adaptive scheduling to fix tail-latency spikes in H100 collectives?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 3}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3481", "title": "Optimizing All-Reduce for LLM Training on AMD MI300X Cluster with Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you identify and resolve MI300X all-reduce incast congestion when standard ECN and PFC are insufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3482", "title": "NVIDIA A100 Inference Server Cold Start Analysis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the FP32 and FP16 cold-start model-loading times from the 10 GB/s NFS onto the GPU, and what is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3483", "title": "Scaling LLM Inference on NVIDIA H100 for Real-time Applications", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design H100 LLM inference serving for 1000 QPS and 50 ms p99, including loading, batching, autoscaling, and cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3484", "title": "Scaling High-Throughput LLM Inference on Google TPU v5e", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design TPU v5e LLM inference serving to handle unpredictable traffic, cold starts, model loading, batching, and autoscaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3485", "title": "MI300X LLM Serving Latency Spikes in Staging", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might MI300X LLM inference show staging-only latency spikes despite strong hardware, and how would you diagnose and fix the bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3486", "title": "A100 Large 
Model Deployment Throughput", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the theoretical maximum inferences per second on one GPU if each inference moves 15 GB through a 2.0 TB/s HBM link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3487", "title": "Real-time LLM Deployment on Google TPU v5e with MLOps", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an end-to-end MLOps pipeline to ensure TPU v5e hardware constraints are validated automatically and training-serving consistency is maintained?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3488", "title": "Optimizing ML Workloads on Kubernetes with Google TPU v5e", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Kubernetes device plugins, node affinity, and scheduling to run TPU v5e training efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3489", "title": "Kubernetes Orchestration for High-Performance ML Training with NVIDIA A100", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the Kubernetes configuration, including GPU device plugins, node affinity, and job scheduling policies, to maximize utilization and minimize turnaround times?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3490", "title": "Kubernetes Pod Placement and Resource Allocation for Large Language Model Training on NVIDIA H100s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you configure Kubernetes pods, GPU resources, and node affinity for an 8-H100 distributed LLM training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3491", "title": "Diagnosing Underutilized Google TPU v5e in Kubernetes ML Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely root cause of the low TPU utilization and the OOM errors, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3492", "title": "Kubernetes Orchestration for LLM Training with AMD MI300X", "topic": "container-orchestration", 
"competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build a Kubernetes training platform for MI300X LLM workloads with ROCm devices, topology-aware affinity, fair scheduling, and low fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3493", "title": "Optimizing Distributed ML Training on Kubernetes with A100 GPUs", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule a 16-A100 distributed training job onto only A100-80GB nodes using Kubernetes device plugins, node affinity, and batch scheduling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3494", "title": "Optimizing MoE Training on Kubernetes with Google TPU v5e", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose TPU v5e underutilization and HBM issues for a distributed MoE model, and optimize the Kubernetes deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3495", "title": "Optimizing LLM Training on Kubernetes with AMD MI300X", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the MI300X LLM training job below 50% GPU utilization, and what concrete Kubernetes and data-pipeline fixes would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3496", "title": "Optimizing Distributed ML Training on Kubernetes with NVIDIA A100s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose low GPU utilization and implement Kubernetes/NVIDIA scheduling strategies for topology-aware distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3497", "title": "A100 Model Deployment: Canary Release with Latency Budget", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many NVIDIA A100 GPUs would you need to provision for the canary cluster to handle its allocated traffic while respecting the performance improvements and latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3498", "title": "Diagnosing Latency Spikes and OOM During Canary Rollout on NVIDIA H100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", 
"level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically diagnose the root cause of these seemingly contradictory symptoms (low utilization and OOM) and stabilize the system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3499", "title": "Progressive Rollout of a Large Language Model on Google TPU v5e", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a progressive rollout strategy—incorporating shadow, canary, traffic splitting, and automated rollbacks—optimized for TPU v5e constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3500", "title": "Progressive Rollout of a Large Language Model on AMD MI300X", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you progressively roll out the 150 GB LLM on MI300X while monitoring performance and enabling rapid rollback?", "chain_ids": ["cloud-chain-auto-secondary-016-08"], "chain_positions": {"cloud-chain-auto-secondary-016-08": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3501", "title": "Progressive Rollout of a Large Language Model with Fast Rollback", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What progressive rollout strategy would you use to deploy the larger LLM with high availability and fast rollback?", "chain_ids": ["cloud-chain-auto-secondary-016-08"], "chain_positions": {"cloud-chain-auto-secondary-016-08": 1}, "chain_tiers": {"cloud-chain-auto-secondary-016-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3502", "title": "Optimizing Multi-Model RAG Latency on NVIDIA A100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the A100 RAG deployment to keep end-to-end latency under 500 ms while routing, batching, and minimizing inter-model transfer overhead?", "chain_ids": ["cloud-chain-auto-016-01"], "chain_positions": {"cloud-chain-auto-016-01": 5}, "chain_tiers": {"cloud-chain-auto-016-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3503", "title": "Choosing Checkpoint Frequency for a 1000-GPU LLM Training Run", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose checkpoint frequency, contents, and method for a weeks-long 
1000-GPU LLM training run with frequent failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3504", "title": "TPU-Powered Recommendation System: Diagnosing Data & Concept Drift", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the recommendation quality drop on TPU v5e, distinguishing data drift, concept drift, and training-serving skew at scale?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3505", "title": "Real-time ML Output Drift Detection on AMD MI300X", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build real-time drift detection for 1024-D FP16 embeddings on MI300X while minimizing serving overhead?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3506", "title": "Real-time Data Drift Detection for Transformer Models on NVIDIA A100 Architectures", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement real-time PSI or KL drift detection for both serving architectures, and what are the trade-offs?", "chain_ids": ["cloud-chain-auto-secondary-017-12"], "chain_positions": {"cloud-chain-auto-secondary-017-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3507", "title": "Real-time LLM Data Drift Detection on NVIDIA H100", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you size, implement, and integrate real-time input-drift detection for an LLM with minimal latency impact?", "chain_ids": ["cloud-chain-auto-secondary-017-12"], "chain_positions": {"cloud-chain-auto-secondary-017-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3508", "title": "Real-time Recommendation System Drift on AMD MI300X", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and mitigate data drift, concept drift, and 
training-serving skew in the recommendation engine?", "chain_ids": ["cloud-chain-auto-secondary-017-11"], "chain_positions": {"cloud-chain-auto-secondary-017-11": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3509", "title": "Google TPU v5e Inference Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you gracefully degrade a TPU v5e LLM service running at 95% utilization to reduce latency and inference failures?", "chain_ids": ["cloud-chain-auto-secondary-015-17"], "chain_positions": {"cloud-chain-auto-secondary-015-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3510", "title": "Graceful Degradation for LLM Inference on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a graceful degradation ladder for MI300X fraud detection, including fallbacks, fail-safe/fail-operational modes, and QoS shedding?", "chain_ids": ["cloud-chain-auto-secondary-015-18"], "chain_positions": {"cloud-chain-auto-secondary-015-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3511", "title": "Graceful Degradation Anomaly on TPU v5e", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely explains the 15% latency increase and 2% relevance drop despite green health checks, and how would you diagnose it?", "chain_ids": ["cloud-chain-auto-secondary-015-17"], "chain_positions": {"cloud-chain-auto-secondary-015-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3512", "title": "Graceful Degradation for Large Language Models on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a degradation ladder and model fallbacks to maintain LLM service functionality under extreme stress on MI300X accelerators?", "chain_ids": ["cloud-chain-auto-secondary-015-18"], "chain_positions": {"cloud-chain-auto-secondary-015-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3513", "title": "Graceful Degradation on NVIDIA A100: Adapting to Load Spikes", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for an overloaded GPU fraud model using fallbacks, fail-operational modes, and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3514", "title": "Graceful Degradation for Real-time ML Inference on NVIDIA H100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you keep the real-time inference service fail-operational under reduced capacity using model fallbacks and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3515", "title": "Graceful Degradation for Real-time Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a graceful degradation strategy with dynamic fallback ladders and QoS shedding?", "chain_ids": ["cloud-chain-auto-secondary-015-17"], "chain_positions": {"cloud-chain-auto-secondary-015-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-17": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3516", "title": "Graceful Degradation for GenAI on AMD MI300X", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you gracefully degrade the sparse MoE service under latency spikes or HBM errors while preserving reliability and resource efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3517", "title": "Identifying Model Extraction Risks on NVIDIA A100 Deployments", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Describe the concept of a model extraction attack in this context and explain how you would protect the A100-hosted API?", "chain_ids": ["cloud-chain-auto-secondary-015-22"], "chain_positions": {"cloud-chain-auto-secondary-015-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3518", "title": "Designing a Robust and Reliable ML System against Adversarial Attacks on Google TPU v5e", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the TPU v5e real-time ML system to remain reliable against poisoning and evasion attacks?", "chain_ids": ["cloud-chain-auto-secondary-015-23"], "chain_positions": {"cloud-chain-auto-secondary-015-23": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-23": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3519", "title": "Diagnosing Covert Adversarial Perturbations on LLMs within an NVIDIA A100 Cloud Fleet", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of these subtle output anomalies and mitigate a potential adversarial attack?", "chain_ids": ["cloud-chain-auto-secondary-015-22"], "chain_positions": {"cloud-chain-auto-secondary-015-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3520", "title": "Autonomous Systems: Adversarial Robustness and Reliability", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the image recognition service to resist adversarial inputs and prompt injection while preserving low-latency throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3521", "title": "TPU v5e Adversarial Defense Throughput Impact for Real-time Fraud Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What overhead does a 1 GFLOP sanitization step add to a 100 GFLOP TPU inference, and how would you manage its impact?", "chain_ids": ["cloud-chain-auto-secondary-015-23"], "chain_positions": {"cloud-chain-auto-secondary-015-23": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3522", "title": "Adversarial Robustness on AMD MI300X in Cloud Deployments", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design cloud MI300X defenses for adversarial images, prompt injection, and side-channel risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3523", "title": "Real-time Observability for H100-powered ML Inference Service", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build telemetry, alerts, dashboards, MTBF/MTTR tracking, and straggler detection for the H100 inference cluster?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3524", "title": "High-Performance ML Inference Monitoring: H100 
Architecture Comparison", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor these two H100 architectures, including GPU metrics, stragglers, tail latency, and cross-GPU communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3525", "title": "Diagnosing Stragglers in Real-time LLM Inference on AMD MI300X Cluster", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the p99 latency spikes and quantify the reliability improvement from your fix?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3526", "title": "A100 Inference Performance Degradation: Monitoring & Anomaly Detection", "topic": "monitoring-observability", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a monitoring strategy to detect, diagnose, and minimize MTTR for these straggler requests?", "chain_ids": ["cloud-chain-auto-secondary-017-16"], "chain_positions": {"cloud-chain-auto-secondary-017-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3527", "title": "NVIDIA A100 Data Ingestion Bottleneck for Large Language Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is GPU utilization only 40-50%, and which ETL/data-loading optimizations would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3528", "title": "NVIDIA H100 and Feature Freshness in Online Feature Stores", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is point-in-time correctness, and how would you ensure it in the online feature store for this high-performance fraud model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3529", "title": "Optimizing Feature Freshness for Real-time Inference on Google TPU v5e", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you keep features under 500 ms fresh and point-in-time correct across online and offline stores for the TPU v5e recommender?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3530", "title": "Real-time Feature Freshness and Throughput for Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the recommendation feature pipeline to keep features fresh within 500 ms and point-in-time correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3531", "title": "Diagnosing Stale Features in Real-time H100 Model Deployment", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of these stale features impacting the real-time model given GPU under-utilization and a healthy feature store?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3532", "title": "Evaluating Feature Store Architectures for Real-time Fraud Detection on NVIDIA GPUs", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What feature-store architecture would you recommend for sub-10 ms fraud inference, and how do A and B trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3533", "title": "Real-time Feature Serving with H100-powered Embeddings", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the embedding generation and feature serving architecture to guarantee freshness, point-in-time correctness, and scalability to millions of QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3534", "title": "Real-time Fraud Detection with AMD MI300X: Feature Store & Point-in-Time Correctness", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design online and offline feature stores to feed the fraud model with millisecond-fresh, point-in-time-correct features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3535", "title": "Real-time Data Quality & Validation for H100 ML Systems", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality gate before the training cluster to stop schema drift, invalid ranges, and anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3536", 
"title": "Designing a High-Throughput Data Quality Gate for AMD MI300X", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What architecture and throughput calculation would ensure a 2 TB/hour data quality gate does not bottleneck the inference stream?", "chain_ids": ["cloud-chain-auto-secondary-015-28"], "chain_positions": {"cloud-chain-auto-secondary-015-28": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3537", "title": "Real-time Data Quality & Validation for A100-Accelerated Anomaly Detection", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a low-latency ingestion-time data quality gate so bad trading records never reach the A100 inference engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3538", "title": "Optimizing TPU v5e Utilization with Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose data-quality causes of TPU v5e throughput drops and quantify the gain after adding validation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3539", "title": "Ensuring Data Integrity for Petabyte-Scale LLM Training on AMD MI300X Clusters", "topic": "data-quality-validation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a continuous data quality framework for MI300X LLM training that handles schemas, contracts, gates, lineage, and anomalies?", "chain_ids": ["cloud-chain-auto-secondary-015-28"], "chain_positions": {"cloud-chain-auto-secondary-015-28": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-28": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3540", "title": "Optimizing Dataset Curation for Bias Mitigation on Google TPU v5e", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the TPU v5e's 16 GB HBM, BF16 compute, and 1.6 TB/s bandwidth shape active-learning selection and annotation to reduce dataset bias?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3541", "title": "Active Learning Labeling Bottleneck for Autonomous Driving Images", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 10 GPUs, 100 annotators, and a 20% re-annotation rate, where is the bottleneck and how 
would you maximize labeled-image throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3542", "title": "TPU v5e Training Inefficiency: Diagnosing Data Curation Issues", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose whether active-learning selection, annotation quality, inter-annotator agreement, or bias is causing the training instability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3544", "title": "Centralized vs Distributed Labeling for Dataset Curation", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare centralized and distributed edge labeling for 10M images in 3 months while preserving IAA and limiting bias?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3545", "title": "Optimizing Active Learning for Large-Scale Model Training on Google TPU v5e", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you prioritize samples for annotation to maximize model performance gains while minimizing labeling costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3546", "title": "Real-time Anomaly Detection with AMD MI300X: Streaming Data Pipeline Analysis", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the stream-processing pipeline to meet a 50ms anomaly-alert SLA and avoid ingestion or feature bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3547", "title": "Real-time Anomaly Detection on High-Frequency Sensor Streams with NVIDIA H100", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the stream pipeline to process 10 GB/s telemetry and meet the 50 ms anomaly-detection SLA?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3548", "title": "Real-time Sensor Data Ingestion & ML Inference on AMD MI300X", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build the ingestion, feature-computation, and 
integration pipeline for 100,000 sensors emitting 1 KB every 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3549", "title": "Optimizing Real-time Inference on NVIDIA A100 for Streaming Data", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What batching or data-movement fix would address 50 ms P99 spikes and 30% GPU utilization in the 50k events/s anomaly stream?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3550", "title": "Real-Time Anomaly Detection on High-Throughput Sensor Data with Accelerators", "topic": "streaming-ingestion", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a resilient event-stream pipeline for 100M events/s with sub-100 ms anomaly alerts?", "chain_ids": ["cloud-chain-auto-secondary-015-14"], "chain_positions": {"cloud-chain-auto-secondary-015-14": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3551", "title": "Optimizing Data Storage for NVIDIA A100 Training: Parquet vs. TFRecord", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format strategy—Parquet, TFRecord, or hybrid—would maximize training throughput for this recommendation dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3552", "title": "Optimizing Data Ingestion for H100-Powered Foundation Model Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign JSON-based storage and ingestion so H100s no longer spend 70% of training time waiting for data?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3553", "title": "Diagnosing HBM Underutilization in Sparse Feature Training on MI300X", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the training pipeline I/O-bound with zstd Parquet on S3, and what storage format strategy would fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3554", 
"title": "H100 Data Ingestion: Optimizing Storage for Large-Scale ML", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign storage and ingestion for the 10 PB image-feature dataset to maximize H100 utilization?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3555", "title": "Optimizing Large-Scale ML Data Storage for AMD MI300X", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage formats, compression, indexing, and tiering would you use to feed PB-scale multimodal data to MI300X training efficiently?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3556", "title": "Optimizing Data Loading for Large-Scale A100 Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the S3 TFRecord data-loading bottleneck that is starving the training cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3557", "title": "Optimizing Large-Scale Model Training Data I/O for NVIDIA H100 Clusters", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design formats, compression, and storage tiers to keep H100s fed during petabyte-scale multimodal training?", "chain_ids": ["cloud-chain-auto-secondary-007-03"], "chain_positions": {"cloud-chain-auto-secondary-007-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3558", "title": "Optimizing Large-Scale Foundation Model Training on TPU v5e: Data Efficiency & Compute Constraints", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the naive 10 TB training approach inefficient on TPU v5e, and what data-efficiency strategies would you use?", "chain_ids": ["cloud-chain-auto-secondary-015-38"], "chain_positions": {"cloud-chain-auto-secondary-015-38": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-38": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3559", "title": "Data-Efficient LLM Training Design on AMD MI300X", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data 
selection and processing strategy would you use on MI300X to raise ICR and avoid model collapse for multi-petabyte training?", "chain_ids": ["cloud-chain-auto-secondary-015-39"], "chain_positions": {"cloud-chain-auto-secondary-015-39": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-39": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3560", "title": "Diagnosing Data Efficiency and Model Collapse on NVIDIA H100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and address the H100 data-efficiency issues causing diminishing returns, forgetting, and poor ICR?", "chain_ids": ["cloud-chain-auto-secondary-015-40"], "chain_positions": {"cloud-chain-auto-secondary-015-40": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3561", "title": "Optimizing Large-Scale Foundation Model Training with Data Efficiency on Google TPU v5e", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data selection strategy would train the petabyte-scale model on the accelerators within 100 TPU-days while maximizing Information-Compute Ratio (ICR) and avoiding collapse?", "chain_ids": ["cloud-chain-auto-secondary-015-38"], "chain_positions": {"cloud-chain-auto-secondary-015-38": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-38": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3562", "title": "Optimizing Data Pruning for Large Language Model Training on AMD MI300X", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach identifying and implementing an effective data pruning strategy, considering the unique characteristics of the hardware?", "chain_ids": ["cloud-chain-auto-secondary-015-39"], "chain_positions": {"cloud-chain-auto-secondary-015-39": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-39": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3563", "title": "Optimizing LLM Training on A100s: Coreset vs. 
Synthetic Data for the Data Wall Problem", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you evaluate coreset selection versus synthetic data generation on A100s, and would you choose one or a hybrid strategy?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3564", "title": "Optimizing LLM Training Data Efficiency on NVIDIA H100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage data efficiency techniques to address I/O bottlenecks and optimize the Information-Compute Ratio (ICR)?", "chain_ids": ["cloud-chain-auto-secondary-015-40"], "chain_positions": {"cloud-chain-auto-secondary-015-40": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3565", "title": "Optimizing Data Efficiency on Google TPU v5e: Addressing Model Collapse in Large-Scale Foundation Models", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What quantifiable data-efficiency plan would you use to improve the Information-Compute Ratio (ICR) and reduce model-collapse risk at hundreds-of-TB scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3566", "title": "Federated Learning on TPU v5e: Communication and Non-IID Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural choices would mitigate non-IID client drift and communication bottlenecks given the TPU v5e specs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3567", "title": "Federated Averaging Optimization on NVIDIA A100 for Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the primary bottleneck (communication vs. 
computation on A100), propose an optimization strategy, and quantify its expected impact?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 2}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3568", "title": "Optimizing Federated Learning for Non-IID Edge Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the federated learning protocol to handle communication limits, non-IID clients, cross-device privacy, and high-performance server-side aggregation?", "chain_ids": ["cloud-chain-auto-016-10"], "chain_positions": {"cloud-chain-auto-016-10": 3}, "chain_tiers": {"cloud-chain-auto-016-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3569", "title": "DP-SGD and Privacy Budget on AMD MI300X", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the epsilon privacy budget affect DP-SGD noise and model accuracy on the MI300X?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3570", "title": "Optimizing DP-SGD on NVIDIA A100 for Federated Learning Privacy-Utility Tradeoff", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calibrate clipping, noise, and epsilon accounting for DP-SGD with epsilon=8 on A100s while preserving medical model utility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3571", "title": "Designing a DP-SGD System with NVIDIA H100 for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the federated DP-SGD architecture on H100s, including epsilon accounting, noise calibration, batch sizing, and privacy monitoring?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3572", "title": "Optimizing DP-SGD on a 16 GB Accelerator with Privacy Budget Constraints", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you calibrate the noise scale (σ) per step to meet the privacy 
budget while considering hardware constraints?", "chain_ids": ["cloud-chain-auto-secondary-009-14"], "chain_positions": {"cloud-chain-auto-secondary-009-14": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3573", "title": "DP-SGD Misconfiguration on AMD MI300X: Utility Drop & Rapid Privacy Budget Consumption Diagnosis", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What root causes would you investigate for a 15% utility drop and rapid epsilon consumption in DP-SGD?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3574", "title": "DP-SGD Scaling on NVIDIA H100: Balancing Privacy and Performance", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do the DP-SGD noise scale and per-sample gradient computation impact utility, throughput, and resource utilization on the H100?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 0}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3575", "title": "DP-SGD Model Evaluation on Google TPU v5e: Optimizing Privacy-Utility Tradeoffs for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate client-side versus server-side DP-SGD on TPU v5e for privacy budget, utility, noise calibration, and throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-14"], "chain_positions": {"cloud-chain-auto-secondary-009-14": 1}, "chain_tiers": {"cloud-chain-auto-secondary-009-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3576", "title": "DP-SGD Deployment on AMD MI300X: Budgeting Epsilon and Performance Impact", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you configure DP-SGD for the 70B LLM to meet epsilon=8, delta=1e-5 over 10 epochs while preserving throughput and quality?", "chain_ids": ["cloud-chain-auto-secondary-009-13"], "chain_positions": {"cloud-chain-auto-secondary-009-13": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3577", "title": "Optimizing DP-SGD on NVIDIA H100 for Federated Learning", "topic": 
"differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and calibrate DP-SGD across H100s to meet epsilon=8.0 while preserving model utility and throughput?", "chain_ids": ["cloud-chain-auto-secondary-009-12"], "chain_positions": {"cloud-chain-auto-secondary-009-12": 2}, "chain_tiers": {"cloud-chain-auto-secondary-009-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3578", "title": "Fairness Metric Definitions on NVIDIA A100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can you define and differentiate between 'demographic parity' and 'equalized odds' in this context?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 0}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3579", "title": "Fairness Evaluation and Root Cause Analysis on Large Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the generative model's toxic output disparities and use the H100 cluster to analyze and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3580", "title": "Architecting a Real-time Fairness Evaluation System on Google TPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect real-time fairness monitoring on TPU v5e for demographic parity, equalized odds, and intersectional subgroups?", "chain_ids": ["cloud-chain-auto-secondary-014-32"], "chain_positions": {"cloud-chain-auto-secondary-014-32": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3581", "title": "Diagnosing Bias in a Large-Scale Model Deployed on NVIDIA A100 for Demographic Parity Failures", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the production demographic-parity gap in the loan approval model running on the A100?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 2}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3582", "title": "Fairness Evaluation of Large-Scale Models on Google TPU v5e", "topic": "fairness-evaluation", "competency_area": 
"cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you sample and process data on TPU v5e to compute subgroup TPR/FPR for equalized odds without degrading production SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3583", "title": "Fairness-Aware LLM Evaluation on AMD MI300X", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare sequential versus distributed fairness evaluation on MI300X while preserving deep intersectional subgroup analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3584", "title": "Optimizing Fairness-Aware Data Pipelines on NVIDIA H100 for LLM Content Moderation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and optimize the data pipeline to improve subgroup fairness evaluation and latency on the H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3585", "title": "Environmental Impact Disclosure for LLM on NVIDIA H100", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What environmental footprint disclosures and energy or carbon guardrails should the H100 model card include?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3586", "title": "TPU v5e Deployment and Responsible AI Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How could TPU v5e hardware behavior contribute to subtle demographic bias and affect Responsible AI guardrails?", "chain_ids": ["cloud-chain-auto-secondary-015-35"], "chain_positions": {"cloud-chain-auto-secondary-015-35": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3587", "title": "Guardrail Latency and Compute Budget for Responsible AI", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the minimum average FP16 TFLOPS rate the GPU must sustain while executing the guardrail model to meet the remaining latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"cloud-3588", "title": "Diagnosing Emergent Bias in LLM on NVIDIA H100 Cluster", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the low-frequency bias using model cards, impact assessments, red-teaming, guardrails, and H100 telemetry?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3589", "title": "Designing a Responsible AI Governance Framework for High-Volume Credit Scoring on Google TPU v5e", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a Responsible AI governance service for continuous impact assessments, red-teaming, and dynamic model cards?", "chain_ids": ["cloud-chain-auto-secondary-015-35"], "chain_positions": {"cloud-chain-auto-secondary-015-35": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3590", "title": "Responsible LLM Deployment on AMD MI300X with Bias Mitigation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy responsibly on MI300X under launch pressure, including guardrails, red-teaming, model cards, and accountability?", "chain_ids": ["cloud-chain-auto-secondary-015-36"], "chain_positions": {"cloud-chain-auto-secondary-015-36": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3591", "title": "Ethical LLM Deployment on H100s: Performance vs. 
Responsible AI", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you integrate guardrails, model cards, and impact assessments while meeting 10,000 RPS and 200 ms p99?", "chain_ids": ["cloud-chain-auto-secondary-015-34"], "chain_positions": {"cloud-chain-auto-secondary-015-34": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-34": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3592", "title": "Responsible AI Framework for a 175B Financial LLM", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Responsible AI governance framework for the biased 175B financial LLM?", "chain_ids": ["cloud-chain-auto-secondary-015-36"], "chain_positions": {"cloud-chain-auto-secondary-015-36": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3593", "title": "Cost-Performance Diagnosis: LLM Training on Google TPU v5e", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose TPU v5e underutilization and reduce LLM training TCO without sacrificing throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3594", "title": "Optimizing LLM Deployment TCO on NVIDIA A100: Spot vs. 
Reserved Instance Strategy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build a 2-year A100 cloud TCO analysis and choose between on-demand, spot, reserved, or hybrid deployment?", "chain_ids": ["cloud-chain-auto-018-03"], "chain_positions": {"cloud-chain-auto-018-03": 2}, "chain_tiers": {"cloud-chain-auto-018-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3595", "title": "NVIDIA H100 Deployment Strategy: Optimizing TCO for LLM Training", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What H100 cloud procurement and workload strategy would reduce TCO for 60% predictable training and 150% burst spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3596", "title": "Little's Law Throughput Bound on H100 Inference Cluster", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using Little's Law, what steady-state throughput corresponds to 480 requests in flight and 120 ms mean service time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3597", "title": "M/M/1 Queue Utilization and Mean Wait on A100 Serving Node", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the mean queue wait at λ=90 req/s and µ=100 req/s, and why is 90% utilization dangerous for tail latency?", "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 1}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3598", "title": "Tail Latency Amplification in Multi-Stage H100 Pipeline", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Explain tail latency amplification: why does a 4-stage pipeline with 50ms per-stage p99 show ~220ms end-to-end p99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3599", "title": "M/M/c Queue Design for TPU v5e Serving Pool", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many TPU servers (c) do you need, and what utilization target should you set to maintain a p99 latency under 25ms?", "chain_ids": ["cloud-chain-auto-024-15"], "chain_positions": {"cloud-chain-auto-024-15": 1}, "chain_tiers": {"cloud-chain-auto-024-15": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3600", "title": "Head-of-Line Blocking in LLM Decode Queue on MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What head-of-line blocking is caused by the 80% short and 20% long FIFO mix, and how would you fix it?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 1}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3601", "title": "Queuing Model for Prefill-Decode Disaggregation on H100", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using queuing theory, how should you split 26 GPUs, and is that enough to keep both pools below 75% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3602", "title": "Token Bucket vs Leaky Bucket Rate Limiting for H100 Serving", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do token bucket and leaky bucket rate limiting compare for 1000 req/s 100 ms bursts on a 600 req/s cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3603", "title": "Work-Conserving Scheduler Analysis for Mixed Priority on MI300X", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a preemptive priority queue for premium and standard traffic, and can premium meet p99<100 ms?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 2}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3604", "title": "Queueing Theory Mastery: SLA-Driven Capacity Planning for H100 Fleet", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What initial G/G/c capacity plan would you test for p99 < 200ms and p999 < 1s at 50,000 req/s with service-time CV=2.0?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 3}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3605", "title": "PUE Calculation for H100 Datacenter Power Budget", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the PUE for 
1000 H100s drawing 700 W each in a 1.4 MW facility, and what does it mean for useful compute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3606", "title": "Cooling Bottleneck Analysis for Dense H100 GPU Rack", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the rack power or cooling bottleneck for 8 HGX H100 nodes, and what rack density is actually usable?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 2}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3607", "title": "Stranded Capacity Identification in Mixed GPU Datacenter", "topic": "datacenter-efficiency", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is capacity stranded in the 200-rack datacenter, and how much H100 compute is unusable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3608", "title": "Power Distribution Unit Redundancy for H100 Training Cluster", "topic": "datacenter-efficiency", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a 512-GPU 14-day training cluster, should the PDU architecture use 2N or N+1 redundancy, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3609", "title": "WUE and Water Efficiency Trade-off for Evaporative Cooling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With WUE=2.5 L/kWh and a 50,000 L/day cap, what maximum IT load is allowed and what constraint dominates?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 2}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3610", "title": "Carbon-Aware Scheduling for H100 Training Jobs", "topic": "sustainability-carbon-accounting", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a carbon-aware job scheduler across Virginia and Oregon to cut CO2 emissions by 50% given their spare capacities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3611", "title": "Power Capping Impact on Training Throughput for MI300X", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can a 750 W to 600 W MI300X 
power cap cause a non-linear throughput loss, and what trade-off does it imply?", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 1}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3612", "title": "Rack-Level Power Provisioning for TPU v5e Pod", "topic": "datacenter-efficiency", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you provision rack, row, and facility power with N+1 redundancy for the 256-chip TPU v5e pod?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3613", "title": "DCiE and Total Facility Efficiency for Multi-Tenant GPU Cloud", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does DCiE=83% mean at PUE=1.2, how is it related to PUE, and is it competitive for a GPU cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3614", "title": "Nordic Liquid-Cooled 10 MW H100 Datacenter Design", "topic": "sustainability-carbon-accounting", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What cooling, power, and location architecture would minimize 5-year TCO for a 10 MW H100 training datacenter under the efficiency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3615", "title": "Ring-AllReduce Bandwidth Calculation for H100 NVLink Cluster", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "For 7B FP32 gradients on 8 H100s over NVLink, how much data does ring-AllReduce transmit per GPU and what is the minimum sync time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3616", "title": "Ring vs Tree AllReduce Trade-off on Multi-Node H100 Cluster", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For synchronizing 52GB gradients on a 64-GPU cluster, does ring-AllReduce or tree-AllReduce perform better, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3617", "title": "Gradient Compression Accuracy-Bandwidth Trade-off on H100 Training", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which gradient compression strategy would you use to reduce the 45 s synchronization bottleneck, and why?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3618", "title": "Gradient Synchronization Overlap with Computation on H100", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How does gradient bucketing overlap the 400 ms AllReduce with the 800 ms backward pass, and what speedup is possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3619", "title": "AllReduce Algorithm Selection for Heterogeneous Bandwidth Topology", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hierarchical AllReduce topology, time estimate, and optimization should the 256-GPU cluster use for 40 GB gradients?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 2}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3620", "title": "Gradient Staleness in Asynchronous SGD on A100 Cluster", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does τ=8 step gradient staleness in asynchronous SGD affect convergence, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3621", "title": "Reduce-Scatter and AllGather Decomposition on H100 NVLink", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For ZeRO Stage 2 on 8 H100s with a 16 GB gradient buffer, what are the Reduce-Scatter and AllGather volumes versus standard AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3622", "title": "Gradient Checkpointing Impact on Synchronization Frequency", "topic": "gradient-synchronization", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does gradient checkpointing change the compute-to-communication ratio and AllReduce overlap efficiency for the 30B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3623", "title": "TPU v5e AllReduce Ring Assumptions vs 2D Torus Bisection", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Diagnose why the measured TPU v5e AllReduce time is 2.1s when ring-AllReduce analysis predicts 800ms, and explain the interconnect property responsible?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3624", "title": "NCCL AllReduce Tuning for H100 Multi-Rail InfiniBand", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you tune NCCL_ALGO, NCCL_PROTO, and message thresholds to improve 128-GPU AllReduce bandwidth from 85% to 95%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3625", "title": "Gradient Synchronization Fluency: AllReduce Taxonomy", "topic": "gradient-synchronization", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How do AllReduce, AllGather, Reduce-Scatter, Broadcast, and Reduce differ in communication volume per rank and training use case?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3626", "title": "Gradient Synchronization Mastery: Pipeline Parallelism + AllReduce Co-Design", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you schedule tensor, pipeline, and data-parallel communication for the 1024-GPU 3D-parallel 175B training system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3627", "title": "RDMA Write vs Send Semantics on InfiniBand for H100 Parameter Server", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Should the parameter server use RDMA Write or RDMA Send for gradient pushes, and what are the key differences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3628", "title": "RoCEv2 vs InfiniBand Latency and Congestion on H100 Training Cluster", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the new training cluster use native InfiniBand or RoCEv2 over 200GbE, and how do they compare for ML training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3629", "title": "Zero-Copy RDMA Registration and Memory Pinning for GPU Direct", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why might GPUDirect RDMA show 40% higher latency than CPU-path RDMA, and how would you fix the memory registration issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3630", "title": "Kernel Bypass Networking Overhead 
Analysis for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 100 MB gradient chunks over 25 GB/s links, when does RDMA's 3 µs latency matter versus TCP's 40 µs latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3631", "title": "InfiniBand Adaptive Routing for Hotspot Avoidance in H100 Fat-Tree", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use adaptive routing to eliminate AllReduce hotspots in the 512-H100 fat-tree InfiniBand cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3632", "title": "RDMA QP (Queue Pair) Scalability Limits on Large H100 Clusters", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does one RDMA Queue Pair per peer fail to scale on 1024 H100s, and what connection strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3633", "title": "GPUDirect Storage and RDMA for Checkpoint Loading on H100", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would GPUDirect Storage (GDS) meaningfully reduce 70B checkpoint save and restore times to NFS, and what improvement should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3634", "title": "RDMA Transport Fluency: IB Verbs API Key Operations", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you implement basic InfiniBand Verbs message passing, and what does zero-copy mean in that RDMA path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3635", "title": "RDMA Transport over Lossy Networks: RoCEv2 Congestion Collapse", "topic": "rdma-transport", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the RoCEv2 PFC storms during AllReduce, and how would you configure the network to prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3636", "title": "InfiniBand vs Ethernet Cost-Performance Analysis for H100 Scale-Out", "topic": "rdma-transport", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which network should a 1024-H100 training cluster choose, HDR 
InfiniBand or 400GbE RoCEv2, given cost and performance trade-offs?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 2}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3637", "title": "RDMA Rail Fabric for 4096-GPU Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the network fabric to meet the 4096-GPU cluster's latency, all-reduce, fault-tolerance, and $50M budget requirements?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 3}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3638", "title": "InfiniBand Subnet Manager Failover for Training Cluster", "topic": "rdma-transport", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design InfiniBand Subnet Manager failover so a management-node crash does not kill the 7-day training job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3639", "title": "RDMA Read vs Write Semantics for KV Cache Transfer in LLM Disaggregation", "topic": "rdma-transport", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Would you use RDMA Read or RDMA Write to transfer the 2GB KV cache between prefill and decode H100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3640", "title": "P99 Latency Spike Diagnosis on H100 Inference Cluster", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and reduce the recommender service's P99 latency from 180ms to under the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3641", "title": "Hedged Requests for LLM Inference on A100 Fleet", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What hedge timeout and safeguards would you use to bring P99 below 3s without causing a load cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3642", "title": "Load Balancing Strategy to Minimize P999 on TPU v5e", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the BERT-large service route requests so the 0.1% long sequences stop driving P999 to 
800ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3643", "title": "P99 vs P999 Tradeoffs in Batching Strategy", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you change the dynamic batching policy to meet P999 < 200ms without collapsing GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3644", "title": "Tail Latency SLO Decomposition Across Microservices", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you set per-stage latency SLOs so the 5-stage pipeline meets a true 100ms end-to-end P99?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3645", "title": "Measuring and Setting Realistic P999 SLOs for Autoregressive Generation", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What SLA would you propose for 200-2000 token generations on the 70B A100 service, and how would you architect to meet it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3646", "title": "Flash Attention Tiling Strategy for H100 SRAM Constraints", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What Flash Attention tile sizes would you choose for seq_len=4096 and d_head=128, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3647", "title": "Flash Attention IO Complexity vs Standard Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What theoretical IO reduction should the team expect from FlashAttention-2 versus standard attention at seq_len=8192 on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3648", "title": "Flash Attention Backward Pass Memory Recomputation", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should you revert from Flash Attention to standard attention to reduce backward-pass FLOPs at seq_len=16384, and why?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 2}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3649", "title": "Multi-Query Attention 
Memory Savings vs Flash Attention", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For batch-1 decode, should you use MHA with Flash Attention, MQA, or GQA to minimize latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3650", "title": "Flash Attention with Variable-Length Sequences and Padding Masks", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you recover Flash Attention’s expected speedup for padded variable-length batches at seq_len=2048?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3651", "title": "Flash Attention v2 vs v3 Block Size Selection on H100", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you adopt Flash Attention v3 over v2 on H100 for the 70B model, and for which workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3652", "title": "Flash Attention Numerical Stability with Long Sequences", "topic": "flash-attention", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you debug and fix NaNs from Flash Attention at seq_len=32768 on TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3653", "title": "Diagnosing Flash Attention Regression After Library Upgrade", "topic": "flash-attention", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you investigate and fix the 25% throughput regression after upgrading flash-attn from 2.3 to 2.6?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3654", "title": "Kubernetes GPU Scheduling Fragmentation with H100 MIG", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 14-GPU pod fail to schedule despite 16 MIG instances, and how would you fix the Kubernetes configuration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3655", "title": "HPA Scaling Latency for GPU Inference Pods", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you redesign autoscaling so the A100 inference service handles a 3x traffic spike before the HPA's 4-minute reaction 
time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3656", "title": "Multi-GPU Pod Affinity for NVLink-Dependent Workloads", "topic": "container-orchestration", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule the 70B training pods so 8-GPU jobs stay within one node and avoid the 18x all-reduce slowdown?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3657", "title": "Container Image Size and Model Loading Latency on GPU Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce the 13B model pod cold start from 8 minutes to under 2 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3658", "title": "Resource Quota and GPU Memory Oversubscription in Kubernetes", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you prevent 70B MI300X workloads from OOM-killing other tenants when Kubernetes only tracks GPU count?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 2}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3659", "title": "Kubernetes Operator Design for Distributed Training Jobs", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you orchestrate the 64-GPU PyTorchJob so a single node preemption does not restart 3 hours of training from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3660", "title": "Custom Kubernetes Scheduler for GPU Memory-Aware Placement", "topic": "container-orchestration", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a Kubernetes scheduler plugin that places inference pods based on free GPU memory instead of GPU count?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 3}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3661", "title": "Kubernetes Network Policy for Secure Multi-Tenant GPU Inference", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you enforce network isolation between 
enterprise customers' pods across Kubernetes namespaces?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3663", "title": "Megatron-LM Column vs Row Parallelism for MLP Layers", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should W1 and W2 in the TP=4 MLP block be split between column and row parallelism, and what communication is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3664", "title": "Tensor Parallelism Communication Bottleneck on Multi-Node Cluster", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you change the TP=16 training setup to remove cross-node tensor-parallel all-reduces and recover MFU?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 2}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3665", "title": "Tensor Parallelism for Attention Heads: Optimal Head Distribution", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you split Q and KV heads for Grouped-Query Attention (GQA) with 32 Q heads, 8 KV heads, and TP=4 on 4 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3666", "title": "Tensor Parallelism Correctness: Dropout and Random State Synchronization", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can TP=4 training with dropout diverge from the single-GPU baseline despite the same seed, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3667", "title": "TP+PP Combined Parallelism Strategy for 540B Model", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What 3D parallelism strategy would you use to train the 540B model on 256 GPUs while targeting MFU > 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3668", "title": "Sequence Parallelism for Tensor-Parallel Activation Memory", "topic": "model-tensor-parallelism", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you fit batch_size=32 for the 30B TP=4 training run when activation memory is causing OOM?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3669", "title": "Tensor Parallelism Embedding Layer Split and Vocabulary Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you tensor-parallelize the 128K-token embedding and LM head with TP=4, and what communication does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3670", "title": "Tensor Parallelism Scaling Efficiency Measurement", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the scaling efficiency from TP=4 to TP=8, and what bottleneck explains the 2100 tokens/sec result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3671", "title": "Pod Disruption Budget Design for Zero-Downtime Model Rollouts", "topic": "container-orchestration", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you configure rolling updates so the 12-replica A100 deployment avoids P99 latency spikes during model rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3672", "title": "MoE Expert Routing Overhead", "topic": "mixture-of-experts", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you reduce the 64-expert MoE router overhead from 12% on a 4096-token batch without reducing the expert count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3673", "title": "Expert Load Imbalance Killing Throughput in MoE", "topic": "mixture-of-experts", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What causes one GPU to hit 100% utilization while the others idle, and how would you fix the load imbalance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3674", "title": "Designing MoE Expert Sharding for TPU v5e", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you shard 128 1B-parameter experts across 64 TPU v5e chips, and what all-to-all cost should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3675", "title": "MoE Auxiliary Loss Tuning for Stable Training", "topic": "mixture-of-experts", "competency_area": "optimization", "track": "cloud", "level": "L5", 
"zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is expert collapse occurring after 5k steps with alpha=0.01, and how would you fix the MoE load-balancing loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3676", "title": "MoE Inference Batching Strategy for Throughput", "topic": "mixture-of-experts", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Should you pad low-rate MoE batches to 64 tokens on MI300X, or use another approach to improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3677", "title": "MoE vs Dense Model Memory Tradeoff at Scale", "topic": "mixture-of-experts", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the memory and compute tradeoffs of the 140B top-2 MoE versus the 70B dense model, and when would you prefer the MoE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3678", "title": "ECMP Hashing Imbalance in All-Reduce Traffic", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the ECMP hashing imbalance, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3679", "title": "PFC Deadlock Prevention in RoCE Cluster", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign the RoCEv2/PFC configuration to prevent 45-second PFC-induced training stalls after topology changes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3680", "title": "Buffer Bloat Causing Gradient Staleness", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does switch buffer bloat cause >5-step gradient staleness, and what would you change to reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3681", "title": "Incast Collapse During Gradient Aggregation", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What failure mode is slowing the 512-worker parameter-server training job, and how would you fix the 32:1 fan-in?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3682", "title": "DCQCN Tuning for Large-Scale 
All-Reduce", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you tune DCQCN for the 400 GbE RoCEv2 cluster to stop the 90% to 40% throughput oscillation?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 2}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3683", "title": "Flow Completion Time vs Bandwidth Tradeoff in Gradient Sync", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why do large all-reduce gradients and small parameter-server control messages need different congestion-control treatment, and how would you support both?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3684", "title": "Multi-Tenant GPU Scheduling with SLO Guarantees", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a scheduling system to guarantee Tenant A's SLO while maximizing Tenant B's GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3685", "title": "Bin-Packing GPU Jobs to Minimize Fragmentation", "topic": "scheduling-resource-management", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the cost of the 18% GPU fragmentation, and what allocation strategy would reduce over-provisioned 8-GPU jobs?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 1}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3686", "title": "Preemption Overhead in Long-Running Training Jobs", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much overhead do the 4-hour preemptions and 18-minute NFS checkpoints add, and how would you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3687", "title": "Gang Scheduling for Distributed Training Efficiency", "topic": "scheduling-resource-management", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is all-or-nothing gang scheduling necessary for the 30 concurrent 8-GPU jobs, and what scheduler behavior would fix utilization?", "chain_ids": ["cloud-chain-auto-021-05"], "chain_positions": {"cloud-chain-auto-021-05": 1}, "chain_tiers": {"cloud-chain-auto-021-05": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3688", "title": "SLO-Aware Autoscaling for Inference Under Bursty Load", "topic": "scheduling-resource-management", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you autoscale 70B LLaMA serving to survive 3x spikes lasting 2-5 minutes when H100 cold starts take 4 minutes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3689", "title": "Spot Instance Preemption Handling in Training Clusters", "topic": "scheduling-resource-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a fault-tolerant checkpointing strategy for 32 A100 spot training so a 2-minute interruption notice loses under 10 minutes of work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3690", "title": "GPipe Bubble Overhead Calculation for 8-Stage Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 8 pipeline stages and 64 micro-batches, what is the pipeline bubble fraction and effective GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3691", "title": "Interleaved Pipeline Schedule to Reduce Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would an interleaved 1F1B schedule with virtual stages change the bubble fraction and memory use for the 8-stage, m=16 pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3692", "title": "Stage Imbalance in Heterogeneous Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much throughput is lost because Stage 4 is 35% slower, and how would you rebalance the 4-stage pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3693", "title": "Micro-batch Size Selection Under Memory Constraints", "topic": "pipeline-parallelism", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the maximum micro-batch size you can use with 8 micro-batches in flight?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 0}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3694", "title": "Combining Pipeline and Tensor Parallelism for 1T Parameter Model", 
"topic": "pipeline-parallelism", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What data, tensor, and pipeline parallelism factors would you choose for a 1T-parameter model on 1024 H100s under a 70GB HBM limit, and why?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 3}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3695", "title": "Online vs Offline Feature Store Consistency Tradeoffs", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 0.8% of recommendation requests serving >24h-stale batch features, and how would you detect and mitigate that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3696", "title": "Feature Serving Latency Budget for Two-Tower Models", "topic": "feature-store-management", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign feature retrieval to bring p99 latency from 18ms under the 5ms budget for the 50ms two-tower SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3697", "title": "Feature Skew Between Training and Serving", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you confirm and fix the training-serving skew?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 2}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3698", "title": "Feature Store Freshness SLAs for Time-Sensitive Models", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What freshness SLA should each fraud feature tier have, and how would the feature store enforce those SLAs?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 0}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3699", "title": "Feature Store Versioning for Safe Model Rollouts", "topic": "feature-store-management", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you version features so the v2 recommendation model can roll out gradually while v1 continues serving safely?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "cloud-3700", "title": "Speculative Decoding Draft Model Selection for 70B LLM", "topic": "speculative-decoding", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What speedup should you expect from a 7B draft model for speculative decoding of the 70B target, and what acceptance rates make it worthwhile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3701", "title": "Priority-Based GPU Scheduling for Mixed Training and Inference", "topic": "scheduling-resource-management", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule inference and elastic training on the 256-GPU cluster to use overnight capacity without violating the p99 < 100ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3702", "title": "Roofline Ceiling Identification on H100 for Transformer MLP Blocks", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the 4096x4096x16384 BF16 GEMM on H100 compute- or memory-bound, and how large is the performance gap?", "chain_ids": ["cloud-chain-auto-secondary-001-02"], "chain_positions": {"cloud-chain-auto-secondary-001-02": 3}, "chain_tiers": {"cloud-chain-auto-secondary-001-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3703", "title": "Memory-Bound vs Compute-Bound Classification for Attention", "topic": "roofline-analysis", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 2048-token, 32-head QK^T attention operation compute- or memory-bound, and what should you use to improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3704", "title": "Flash Attention Arithmetic Intensity Analysis on H100", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How much memory traffic does FlashAttention-2 save versus materializing the attention matrix, and what bandwidth speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3705", "title": "Multi-Query vs Multi-Head Attention KV Cache Trade-off", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do MHA, MQA, and 8-group GQA compare in KV-cache memory and decode throughput for batch 32 at 4096 context on H100 80GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3706", 
"title": "INT8 Quantization Calibration Strategy for LLM Inference on H100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the accuracy and hardware-efficiency tradeoffs of per-tensor versus per-channel INT8 quantization for the 13B LLaMA model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3707", "title": "FP8 Format Selection for H100 Mixed Precision Training", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which FP8 formats should you use for forward and backward passes of the 7B transformer given gradients spanning 1e-7 to 1e3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3708", "title": "HBM3 vs L2 Cache Access Patterns for Transformer Weights on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Will the 402MB QKV weight matrix fit in L2 cache during batch-1 decode, and what per-token latency does HBM streaming imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3709", "title": "MI300X Unified Memory Architecture for Large Model Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can a single MI300X serve the 70B FP16 model without tensor parallelism, and how does its decode throughput compare to 2x H100s with tensor parallelism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3710", "title": "Continuous Batching vs Static Batching Throughput on H100 for LLM Serving", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much throughput improvement should continuous batching provide over static batching for 2048-token slots with 500-token average requests?", "chain_ids": ["cloud-chain-auto-021-08"], "chain_positions": {"cloud-chain-auto-021-08": 3}, "chain_tiers": {"cloud-chain-auto-021-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3711", "title": "Prefill Batch Sizing for TTFT Optimization on A100", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What prefill batch size likely saturates the A100, and why is the TTFT violation threshold under-specified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3712", 
"title": "H100 SXM Rack Power Budget for Training", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 8 H100 SXM GPUs drawing 680W each in a 40kW budget, what headroom remains and would a 700W 9th GPU fit?", "chain_ids": ["cloud-chain-auto-015-02"], "chain_positions": {"cloud-chain-auto-015-02": 3}, "chain_tiers": {"cloud-chain-auto-015-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3713", "title": "Power vs Performance Trade-off with GPU Frequency Scaling on A100", "topic": "power-budgeting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If a GPU is power-capped from 400W to 300W, what frequency and throughput reduction follow after accounting for static power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3714", "title": "AllReduce Bandwidth Requirement for Ring All-Reduce on 8×H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does ring all-reduce take for 52GB FP32 gradients across 8 H100s on NVLink, and can it overlap with backward compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3715", "title": "ReduceScatter vs AllReduce for Pipeline Parallelism on H100 Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do ZeRO-1 and ZeRO-3 per-rank communication patterns compare within 2-way DP groups for an 8-stage PP 70B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3716", "title": "AllToAll Communication for Expert Parallelism in MoE on TPU v5e", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What AllToAll volume and bandwidth lower bound does top-2 MoE dispatch create for batch=512, seq_len=2048, dim=1024?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3717", "title": "Speculative Decoding Acceptance Rate Impact on H100 Inference Throughput", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What approximate throughput improvement can speculative decoding deliver with k=5 and a=0.8 versus 30 tok/s target-only decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3719", "title": "Structured Pruning 30% of MLP Neurons in a 70B LLM on 
MI300X", "topic": "pruning-sparsity", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What parameter reduction, memory savings, and decode throughput speedup result from pruning 30% of MLP neurons in the 70B model?", "chain_ids": ["cloud-chain-auto-005-01"], "chain_positions": {"cloud-chain-auto-005-01": 2}, "chain_tiers": {"cloud-chain-auto-005-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3720", "title": "Thermal Throttling During Long-Context Training on H100", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What throughput loss and recovery-time impact should you expect from throttling from 1.41GHz to 1.19GHz after the 5°C ambient rise?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3721", "title": "Compute-Optimal Scaling Laws Verification with Roofline on A100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For a 50B model trained on 1T tokens across 1024 A100s, what are the training time, memory footprint, and roofline bottleneck?", "chain_ids": ["cloud-chain-auto-secondary-001-01"], "chain_positions": {"cloud-chain-auto-secondary-001-01": 4}, "chain_tiers": {"cloud-chain-auto-secondary-001-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3722", "title": "GPTQ vs AWQ Quantization Quality on H100 for 70B LLM", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Given similar inference speed, would you ship GPTQ or AWQ for Llama-3-70B 4-bit production, and what trade-offs besides PPL matter?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3723", "title": "Tensor Parallelism GEMM Partitioning on H100 for LLM Layer", "topic": "model-serving-infrastructure", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the per-layer communication cost of 8-way tensor-parallel QKV/output projections, and how efficient is it versus ideal scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3724", "title": "Memory Bandwidth Bottleneck Diagnosis for LLM Decode on MI300X", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the MI300X deployment reaching only 63% HBM bandwidth at 
batch_size=4, and how would you investigate the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3725", "title": "SLO Violation Root Cause Analysis for LLM Service on TPU v5e", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What likely causes P99 TTFT to jump to 2.1s while P50 stays 180ms on the TPU v5e Gemma-7B service, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3726", "title": "AllReduce Topology Comparison: Ring vs Tree on H100 IB Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare ring all-reduce, recursive halving-doubling, and a binary tree reduction: which minimizes wall-clock time for this payload?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 0}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3727", "title": "Batch Size Selection for Maximum MFU on A100 During Pre-Training", "topic": "batching-strategies", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does MFU peak at batch_size=128 and drop at 512, and which batch size is optimal for this training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3728", "title": "Carbon Footprint Estimation for LLM Training Run", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the total energy consumption, electricity cost, and carbon footprint for the 21.7-day, 1024-GPU training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3729", "title": "Benchmark MFU vs Achieved FLOPS for Attention vs MLP on H100", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does attention achieve only 31% MFU versus 67% for MLP GEMMs, and what changes would improve overall MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3730", "title": "Prefill-Decode Disaggregation Architecture Specification for H100 Fleet", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many prefill vs decode GPUs would you use, how would you transfer KV cache, and how would you 
load-balance 1000 req/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3731", "title": "Multi-Level Memory Hierarchy Specification for 405B LLM Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tiered memory strategy or limits are required to serve 500 concurrent 8192-token users for Llama-3-405B FP8 on H100s?", "chain_ids": ["cloud-chain-auto-012-05"], "chain_positions": {"cloud-chain-auto-012-05": 4}, "chain_tiers": {"cloud-chain-auto-012-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3732", "title": "Roofline Model for Transformer Encoder vs Decoder on TPU v5e", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the arithmetic intensities and roofline bottlenecks for BERT-large encoder batches and GPT-2-large single-token decoding on TPU v5e?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3733", "title": "Activation Checkpointing Need for 70B Training on 64 A100s", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does this 8-way TP and 8-way DP 70B training setup need activation checkpointing at batch=8 on 64 A100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3734", "title": "Gradient Compression Impact on AllReduce Bandwidth on H100 Cluster", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What net throughput gain and accuracy risk should you expect from 99.9% Top-K gradient sparsification of 26GB gradients on 32 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3735", "title": "NCCL vs MPI AllReduce Performance for Mixed-Precision Training on H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How long should NCCL hierarchical AllReduce and tree-based MPI reduction take for 52GB gradients on the 64-H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3736", "title": "Long-Context Attention with Ring Attention on H100 Multi-Node", "topic": "attention-scaling", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the KV communication volume per ring step and sequential non-overlapped total ring communication time?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3737", "title": "Inference Serving Cost Optimization: Batching Strategy for RAG", "topic": "batching-strategies", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you batch the 10,000 daily embeddings versus 1,000 daily generations to minimize compute cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3738", "title": "Kernel Fusion Impact on Memory Bandwidth for Layer Norm + GELU on H100", "topic": "model-serving-infrastructure", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What speedup should fusing LayerNorm and GELU deliver by reducing HBM traffic from 8GB to 4GB for the 2GB activation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3739", "title": "Disaggregated Serving with Chunked Prefill on A100 80GB", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "With a 25ms decode-stall target, how would you choose a chunked-prefill size and what TTFT trade-off results?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3740", "title": "Pipeline Bubble Analysis for 4-Stage Pipeline Parallelism on H100", "topic": "batching-strategies", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline bubble fraction and GPU utilization result from 4 stages and 4 micro-batches with 50ms forward plus 50ms backward per stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3741", "title": "Dynamic Sparse Attention Pattern Implementation on H100 for Code LLM", "topic": "attention-scaling", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "On an H100, will dynamic sparse attention with 288 attended positions per token at 8K context be faster than dense FlashAttention, and why?", "chain_ids": ["cloud-chain-auto-secondary-013-05"], "chain_positions": {"cloud-chain-auto-secondary-013-05": 2}, "chain_tiers": {"cloud-chain-auto-secondary-013-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3742", "title": "ZeRO-3 Communication Volume Analysis for 175B Model on 256 A100s", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much ZeRO-3 AllGather communication occurs per forward pass for GPT-3 175B on 256 GPUs, and how does it compare to compute time?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3743", "title": "H100 SM Occupancy Analysis for Small-Batch Inference Kernel", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What SM occupancy does this batch=1 GEMM achieve on H100, and why does launching only 32 thread blocks underutilize the GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3744", "title": "Activation Checkpointing Granularity Trade-off for 405B Model Training on H100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Analyze the memory-compute trade-off and determine whether you should use full or selective activation checkpointing?", "chain_ids": ["cloud-chain-auto-012-07"], "chain_positions": {"cloud-chain-auto-012-07": 3}, "chain_tiers": {"cloud-chain-auto-012-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3745", "title": "PUE Reality Check: Cooling Overhead in a Hyperscale Cluster", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much total power does the facility draw, and how many megawatts are consumed purely by cooling, lighting, and power distribution overhead?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 1}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3746", "title": "Rack Power Density Limits with GPU-Dense Nodes", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many DGX nodes can you fit per rack, and what infrastructure changes are needed to reach 80 kW/rack density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3747", "title": "Carbon-Aware Scheduling: Shifting Training to Low-Carbon Hours", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much CO2 can you realistically save by shifting 30% of the workload to low-carbon hours, and what are the engineering tradeoffs?", "chain_ids": ["cloud-chain-auto-020-08"], "chain_positions": {"cloud-chain-auto-020-08": 1}, "chain_tiers": {"cloud-chain-auto-020-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3748", "title": "The Cooling Cliff: Air vs Liquid at GPU Density", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the thermal challenge, and how do you solve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3749", "title": "Stranded Power: When GPU Utilization Tanks PUE", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the effective PUE when utilization is factored in, and how do you improve energy proportionality?", "chain_ids": ["cloud-chain-auto-021-01"], "chain_positions": {"cloud-chain-auto-021-01": 3}, "chain_tiers": {"cloud-chain-auto-021-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3750", "title": "TPU Pod Power Budgeting vs GPU Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which cluster is more power-efficient, and what architectural decisions drive the difference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3751", "title": "Water Usage Effectiveness: The Hidden Cost of Evaporative Cooling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much water does this datacenter consume daily, and why is WUE becoming a critical metric for AI infrastructure?", "chain_ids": ["cloud-chain-auto-020-09"], "chain_positions": {"cloud-chain-auto-020-09": 1}, "chain_tiers": {"cloud-chain-auto-020-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3752", "title": "Power Delivery: UPS Efficiency at Partial Load", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the annual energy wasted in UPS conversion losses, and how do you architect around this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3753", "title": "Compute Density vs Power: Planning a 100 MW AI Campus", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which cooling architecture would you recommend for a 100 MW campus with PUE ≤ 1.08 and 100 kW/rack density, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3754", "title": "Measuring True Energy per Token in Training", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the energy per token for the 512-H100, 
2-week, 2T-token run, how does it compare to benchmarks, and how would you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3755", "title": "Thermal Throttling: When GPUs Self-Protect", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What causes the periodic step time increase to 3.5 seconds, and how do you resolve it?", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 0}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3756", "title": "GPipe Bubble Overhead: The Pipeline Efficiency Tax", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What pipeline bubble fraction results from partitioning 40 layers across 4 GPUs with 8 micro-batches under GPipe?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 1}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3757", "title": "1F1B vs GPipe: Memory Advantage of Interleaved Scheduling", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the peak activation memory per GPU under GPipe versus 1F1B, and why does 1F1B win on memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3758", "title": "Interleaved Pipeline Stages: Halving the Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does an interleaved pipeline schedule reduce the pipeline bubble compared to standard 1F1B, and what is the communication cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3759", "title": "Pipeline Parallelism vs Tensor Parallelism: When to Choose Which", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why should tensor parallelism stay within each MI300X node rather than crossing the 400 Gb/s InfiniBand links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3760", "title": "Micro-Batch Size Selection for Pipeline Efficiency", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the minimum number of micro-batches to keep the PP=4 
bubble under 10%, what micro-batch size follows, and what is the activation memory tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3761", "title": "Pipeline Stalls from Unbalanced Stage Partitioning", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the throughput impact of stage 0 being 1.8× slower, and how would you rebalance the pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3762", "title": "Pipeline Parallelism Activation Checkpointing Interaction", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "With selective activation checkpointing, how much activation memory do you save per stage and what recomputation cost should you expect?", "chain_ids": ["cloud-chain-auto-023-17"], "chain_positions": {"cloud-chain-auto-023-17": 2}, "chain_tiers": {"cloud-chain-auto-023-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3763", "title": "Pipeline Bubble in Zero-Bubble Scheduling", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does zero-bubble pipeline scheduling work, what memory cost does it add, and does it truly achieve zero bubble?", "chain_ids": ["cloud-chain-auto-023-15"], "chain_positions": {"cloud-chain-auto-023-15": 4}, "chain_tiers": {"cloud-chain-auto-023-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3764", "title": "Pipeline Drain Latency in Inference Serving", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the time-to-first-token (TTFT) latency penalty from the pipeline, and how does continuous batching interact with pipeline parallelism for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3765", "title": "mmap for Zero-Copy Model Loading", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the cold-start times and memory implications for malloc+read versus mmap, and when does each loading strategy win?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3767", "title": "mmap vs Safetensors: Cold Start Optimization for 
Model Serving", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For sub-3-second cold starts across 50 safetensors 7B models, how do NVMe read, mmap, and host-RAM caching compare?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3769", "title": "Shared mmap for Multi-Tenant GPU Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a memory-efficient architecture using mmap to share the 26 GB model across 8 workers, and how would you manage the GPU memory?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3770", "title": "mmap and NUMA: The Hidden Latency Trap", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is cudaMemcpy to GPU 1 60% slower after mmap, and how would you design NUMA-aware multi-GPU model loading?", "chain_ids": ["cloud-chain-auto-secondary-017-19"], "chain_positions": {"cloud-chain-auto-secondary-017-19": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3771", "title": "Least-Connections Routing for Heterogeneous GPU Serving", "topic": "load-balancing", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design a load-balancing strategy that accounts for heterogeneous hardware capacities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3772", "title": "Consistent Hashing for Model-Aware Request Routing", "topic": "load-balancing", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does consistent hashing reduce model remapping when a GPU fails, and how would you mitigate the 3-second cold-start penalty?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3774", "title": "FlashAttention Tiling: Why SRAM Size Determines Tile Shape", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which FlashAttention tile sizes fit in 256 KB SRAM for N=4096, d=128, 
and why does tiling reduce HBM traffic from O(N²) to O(N)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3775", "title": "FlashAttention-2 vs Standard Attention: Wall-Clock Speedup", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using IO-complexity reduction, what FlashAttention-2 speedup should you expect over the 85 ms standard attention forward pass?", "chain_ids": ["cloud-chain-auto-014-05"], "chain_positions": {"cloud-chain-auto-014-05": 0}, "chain_tiers": {"cloud-chain-auto-014-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3776", "title": "PagedAttention: Virtual Memory for KV Cache", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does PagedAttention improve KV-cache utilization, and what approximate throughput gain follows?", "chain_ids": ["cloud-chain-auto-014-04"], "chain_positions": {"cloud-chain-auto-014-04": 0}, "chain_tiers": {"cloud-chain-auto-014-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3777", "title": "Ring Attention for Million-Token Contexts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is Ring Attention feasible for this 1M-token KV cache on 8 MI300X GPUs, and what pattern does it use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3778", "title": "Prefix Caching: Amortizing System Prompts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much GPU time is wasted recomputing the 2000-token system prompt, and what annual savings does prefix caching provide at $2 per GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3779", "title": "FlashAttention Backward Pass: The Recomputation Trade", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does FlashAttention handle the missing attention matrix during backward, and what extra compute cost does recomputation add?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 1}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3780", "title": "FlashAttention on TPU: XLA Attention vs Pallas Kernels", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": 
"specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What architectural differences matter for FlashAttention-style kernels on TPU v5e versus GPUs, and is a manual JAX Pallas kernel worth it?", "chain_ids": ["cloud-chain-auto-014-08"], "chain_positions": {"cloud-chain-auto-014-08": 1}, "chain_tiers": {"cloud-chain-auto-014-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3781", "title": "IO-Awareness: Roofline Model for Attention Kernels", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is standard attention memory-bound on an H100, what arithmetic intensity is needed to be compute-bound, and what is standard attention's AI?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 0}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3782", "title": "FlashDecoding: Parallelizing Attention During Inference", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does FlashDecoding parallelize a single-token decode over KV blocks for a 32K context, and what throughput improvement does it provide?", "chain_ids": ["cloud-chain-auto-014-07"], "chain_positions": {"cloud-chain-auto-014-07": 0}, "chain_tiers": {"cloud-chain-auto-014-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3783", "title": "Multi-Query vs Grouped-Query Attention Memory Savings", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache memory does 4096-token context require for MHA, GQA, and MQA in the 70B model, and how does this affect FlashAttention-2 tiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3784", "title": "FlashAttention Memory Savings Enable Longer Training Contexts", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With only 2 GiB left for activations, what max sequence length can standard attention support, and how does FlashAttention change it?", "chain_ids": ["cloud-chain-auto-014-03"], "chain_positions": {"cloud-chain-auto-014-03": 0}, "chain_tiers": {"cloud-chain-auto-014-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3785", "title": "Causal Masking in FlashAttention: Skipping Unnecessary Tiles", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"How does FlashAttention-2 exploit the causal lower-triangular mask when tiling, and what FLOP savings does it get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3786", "title": "FlashAttention-2 Compute-Bound Scaling vs HBM Bandwidth", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For compute-bound FlashAttention-2, how does the comparative performance shift, and which architecture benefits more relative to its standard attention baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3787", "title": "Sliding Window Attention with FlashAttention Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does W=4096 sliding-window attention interact with FlashAttention tiling for a 32K context, and what memory and compute savings result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3788", "title": "FlashAttention-3: Asynchronous Tiling on Hopper", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do TMA and warp specialization let FlashAttention-3 outperform FlashAttention-2, and what peak TFLOPS percentage does it reach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3789", "title": "Chunked Prefill: Balancing TTFT and Decode Throughput", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does chunked prefill with FlashAttention prevent 8192-token prefills from stalling decode, and what chunk size should you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3790", "title": "Online Softmax: The Numerical Foundation of FlashAttention", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How are the running max and running sum maintained across tiles, and why is this critical for correctness?", "chain_ids": ["cloud-chain-auto-014-06"], "chain_positions": {"cloud-chain-auto-014-06": 0}, "chain_tiers": {"cloud-chain-auto-014-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3791", "title": "FlashAttention Kernel Fusion Benefits", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does FlashAttention kernel fusion reduce overhead 
versus five PyTorch attention kernels, and what two overhead types are eliminated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3792", "title": "Attention Sink Tokens and KV Cache Eviction", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the attention sink phenomenon, and what KV-cache eviction policy preserves quality for 128K-context streaming on an 80 GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3793", "title": "FlashAttention with Variable-Length Sequences in a Batch", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For lengths [128, 256, 512, 1024, 2048, 4096, 8192, 16384], what is the correct total unpadded attention work ΣN²?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3794", "title": "FlashAttention-2 Warp Partitioning Strategy", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What synchronization overhead does FlashAttention-2 eliminate by partitioning across the Q sequence dimension, and what is the correct latency-savings range?", "chain_ids": ["cloud-chain-auto-014-02"], "chain_positions": {"cloud-chain-auto-014-02": 3}, "chain_tiers": {"cloud-chain-auto-014-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3795", "title": "FlashAttention Roofline Crossover on MI300X", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At what sequence length does FlashAttention prefill become compute-bound on an MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3797", "title": "FlashAttention for Cross-Attention in Multimodal Models", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is this cross-attention operation memory-bound or compute-bound with standard attention, and how does FlashAttention-2 change this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3798", "title": "MoE Routing: Top-K Gating and Load Imbalance", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 256 tokens, 8 experts, and top-2 routing, how many tokens should each expert process uniformly, and what is the impact of the observed imbalance?", 
"chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 0}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3799", "title": "MoE Memory Footprint: Why Sparse Models Need More VRAM", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does Mixtral 8x7B's VRAM requirement compare with a dense 13B model, and why can't you load only the active 13B parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3800", "title": "Expert Parallelism: All-to-All Communication Pattern", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the MoE All-to-All communication volume, and why is All-to-All the expert-parallel collective instead of AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3801", "title": "Capacity Factor Tuning: Quality vs Throughput", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should you reduce the MoE capacity factor from 1.5 to 1.0, and what are the utilization-versus-token-drop tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3802", "title": "Auxiliary Load Balancing Loss: Mechanism and Coefficient", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What fraction f_i does each of the 5 nearly dormant experts receive if they uniformly share the remaining 20% of routed tokens?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 1}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3803", "title": "MoE Inference Latency: The Expert Loading Problem", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using 3.35 TB/s HBM3 bandwidth, what is the dense 13B INT4 weight-read lower bound for batch-1 decode latency?", "chain_ids": ["cloud-chain-auto-013-03"], "chain_positions": {"cloud-chain-auto-013-03": 2}, "chain_tiers": {"cloud-chain-auto-013-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3804", "title": "Expert Parallelism + Tensor Parallelism: Hybrid MoE Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "How would you combine expert parallelism, tensor parallelism, and data parallelism for the 1.6T-parameter MoE on 256 MI300X GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3805", "title": "Token Dropping in MoE: Quality Impact at Scale", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With an 8% token drop rate under top-2 routing, is the perplexity gap primarily due to token dropping, and what fraction of tokens lose an expert assignment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3806", "title": "MoE FLOP Efficiency: Why Sparse Models Train Faster", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do per-token and same-quality training FLOPs compare for the 1.6T MoE versus the dense 175B model?", "chain_ids": ["cloud-chain-auto-013-01"], "chain_positions": {"cloud-chain-auto-013-01": 1}, "chain_tiers": {"cloud-chain-auto-013-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3807", "title": "Sparse Gating: Softmax vs Sigmoid Router Design", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the implications of softmax versus sigmoid gating for load balancing, gradient flow, and expert specialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3808", "title": "Megablocks: Variable-Length Expert Computation", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does block-sparse GEMM remove capacity padding, and for batch 512, 8 experts, and capacity factor 1.5, what padding remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3809", "title": "Expert Choice Routing: Inverting the Selection", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Token Choice and Expert Choice routing compare on load balancing, token drops, and adaptive expert computation per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3810", "title": "MoE Batch Size Scaling: Sparsity Advantage Vanishes", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does MoE's latency scale worse with batch size?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3811", "title": "Fine-Grained MoE: More Experts, Smaller Each", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At batch size 256, how many distinct experts should you expect to activate out of the 160?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3812", "title": "InfiniBand All-to-All Transfer Time", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How long does a 4 MB All-to-All transfer take over a 50 GB/s InfiniBand link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3813", "title": "Expert Offloading: CPU-GPU Expert Swapping for Inference", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is CPU expert swapping over PCIe 4.0 viable for serving Mixtral 8x7B under a 50 ms/token interactive latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3814", "title": "Shared vs Routed Experts: DeepSeek's Hybrid Architecture", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why include 2 shared experts alongside 160 routed experts, and what systems benefits does this hybrid MoE design provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3815", "title": "MoE Training Instability: Expert Collapse Diagnosis", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused this expert collapse, and how do you recover the training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3816", "title": "MoE Gradient Computation: The Router Gradient Challenge", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What gradient bias does the straight-through top-k estimator introduce, and what alternative routing-gradient approaches exist?", "chain_ids": ["cloud-chain-auto-013-02"], "chain_positions": {"cloud-chain-auto-013-02": 3}, "chain_tiers": {"cloud-chain-auto-013-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3817", "title": "MoE Serving with Expert Caching", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", 
"level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a GPU expert cache for a 64-expert top-2 MoE when most conversations activate only 12-15 experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3818", "title": "MoE vs Dense: Cost-per-Token Comparison for Training", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the dense 70B and MoE 8x70B options compare in total GPU-hours and cost at $2/GPU-hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3819", "title": "MoE Expert Pruning for Inference Optimization", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can you prune the 4 low-traffic experts for inference, and what memory savings and quality impact should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3820", "title": "MoE with Tensor Parallelism: Splitting Experts Across GPUs", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you lay out tensor parallelism within each 25B-parameter expert and expert parallelism across 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3821", "title": "MoE Quantization: Expert-Level vs Layer-Level Calibration", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the uniform GPTQ calibration, and how do you fix the severe quality degradation in specialized experts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3822", "title": "MoE Architecture: Router Network Design and Placement", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which layers are replaced with MoE layers, and how does the router network interact with the expert MLPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3829", "title": "Shadow Deployment PCIe Bottleneck", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling 100% shadow traffic degrade performance despite low H100 compute utilization and free HBM bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3830", "title": "H100 Batch Size 1 Compute Collapse", "topic": "accelerator-comparison", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does H100 compute utilization collapse below 1% for batch-1 inference with an arithmetic intensity of 1 FLOP/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3831", "title": "Analyzing Training Time Overhead of Gradient Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does full gradient checkpointing add exactly 100 ms to the training step, and what compute-memory tradeoff causes it?", "chain_ids": ["cloud-chain-auto-027-24"], "chain_positions": {"cloud-chain-auto-027-24": 0}, "chain_tiers": {"cloud-chain-auto-027-24": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3832", "title": "LLM Inference Utilization at Batch Size 1", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does compute utilization collapse at batch size 1, and why is peak-TFLOPS cost estimation flawed for this workload?", "chain_ids": ["cloud-chain-auto-secondary-015-07"], "chain_positions": {"cloud-chain-auto-secondary-015-07": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3833", "title": "PFC Congestion Spreading and Cluster Throughput Collapse", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a localized receiver bottleneck during RoCEv2 All-to-All cause cluster-wide throughput collapse under PFC?", "chain_ids": ["cloud-chain-auto-020-10"], "chain_positions": {"cloud-chain-auto-020-10": 0}, "chain_tiers": {"cloud-chain-auto-020-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3834", "title": "Kubernetes Pod Affinity and GPU Bandwidth", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did default Kubernetes pod spreading bottleneck training, and how did co-locating the 4 pods on one NVLink node fix it?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 0}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3835", "title": "Optimal Checkpoint Interval Scaling Tradeoffs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", 
"status": "published", "phase": "training", "question": "Why does doubling the GPU count reduce the optimal checkpoint interval when checkpoint write time is unchanged?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3836", "title": "Explaining Stale Online Features with Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are recommendations stale despite 10ms inference and 20% GPU utilization, and where is the true bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3837", "title": "FlashAttention Arithmetic Intensity Shift", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does standard attention underutilize the A100's compute capacity, and how does FlashAttention's SRAM tiling shift the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3838", "title": "Uncoalesced Memory Access in Gather Kernel", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the strided gather saturate 2.0 TB/s HBM bandwidth but deliver only about 125 GB/s of useful data?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3839", "title": "Throughput Collapse in Fallback Model Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does switching from a batched 30B model to a batch-1 7B fallback reduce total tokens/second instead of shedding load?", "chain_ids": ["cloud-chain-auto-secondary-015-16"], "chain_positions": {"cloud-chain-auto-secondary-015-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3840", "title": "Ring-AllReduce Cross-Node Bottleneck", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does Ring-AllReduce jump from 0.155 s on 8 GPUs to 3.0 s on 16 GPUs across two nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3841", "title": "Compiler Heuristics for MatMul Fusion", "topic": "graph-compilation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "Why would the compiler avoid fusing the compute-bound MatMul with GeLU despite the usual memory-bandwidth benefit of fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3842", "title": "Analyzing Memory Bounds in Unfused Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are separate GeLU, Dropout, and Scale kernels memory-bound, and how does fusing them change HBM traffic?", "chain_ids": ["cloud-chain-auto-secondary-004-22"], "chain_positions": {"cloud-chain-auto-secondary-004-22": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3843", "title": "Offline Distillation PCIe Bandwidth Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does offline distillation with full-vocabulary teacher logits collapse A100 utilization despite a small 100M student model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3844", "title": "KV Cache Pre-allocation OOM on A100 GPU", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the service OOM on the 25th request when each active request uses only 100 tokens of context, and the maximum KV cache for 4096 tokens is exactly 2 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3845", "title": "H100 Vector Addition Utilization Collapse", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FP16 vector addition on a GPU show less than 0.1% compute utilization despite the GPU's 989 TFLOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3846", "title": "Gradient Accumulation Memory Reduction Discrepancy", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does gradient accumulation reduce memory to 58 GB instead of 22 GB?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3847", "title": "Analyzing FP16 Speedup on A100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does switching this [16, 4096] x [4096, 4096] layer from FP32 to FP16 produce exactly a 2x speedup on the A100?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3848", "title": "MoE Capacity Factor Communication Overhead", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does raising the MoE capacity factor from 1.0 to 1.5 increase All-To-All latency by 50% even though valid routed tokens are unchanged?", "chain_ids": ["cloud-chain-auto-013-04"], "chain_positions": {"cloud-chain-auto-013-04": 0}, "chain_tiers": {"cloud-chain-auto-013-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3849", "title": "Analyzing Low Compute Utilization in Memory-Bound Kernels", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is compute utilization capped near 13% when the layer's arithmetic intensity is 20 FLOPs/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3850", "title": "H100 Throughput Drop at Constant Power Limit", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the clock speed and GEMM throughput drop after 2 minutes even though power draw remains at the 700W limit?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3851", "title": "OOM During Optimizer Initialization on A100", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AdamW initialization OOM on an 80 GB A100 after loading only 14 GB of FP16 weights for a 7B model?", "chain_ids": ["cloud-chain-auto-008-16"], "chain_positions": {"cloud-chain-auto-008-16": 1}, "chain_tiers": {"cloud-chain-auto-008-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3852", "title": "Multi-Tier LLM Fallback and Load Shedding", "topic": "graceful-degradation", "competency_area": "reliability", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design request routing, degradation, and load shedding to survive a 10,000 QPS spike while prioritizing premium users?", "chain_ids": ["cloud-chain-auto-secondary-015-16"], "chain_positions": {"cloud-chain-auto-secondary-015-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3853", "title": "Architecting an Online Distillation Pipeline for a 70B Model", "topic": "knowledge-distillation", 
"competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the 70B-to-7B distillation pipeline to avoid logit I/O bottlenecks and keep student training busy?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-29": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3854", "title": "KV-Cache Aware LLM Routing", "topic": "load-balancing", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route variable-length LLM requests to maximize KV-cache hits and utilization while avoiding hot spots and OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3855", "title": "Architecting Mixed-Precision Training for a 100B LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What mixed-precision formats would you use for weights, activations, gradients, and optimizer states when training the 100B LLM, and why?", "chain_ids": ["cloud-chain-auto-secondary-015-02"], "chain_positions": {"cloud-chain-auto-secondary-015-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3856", "title": "Distributed Serving Architecture for 70B LLM", "topic": "model-size-estimation", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you parallelize and allocate memory for serving the 70B FP16 chat model on each 8x H100 node to support high concurrency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3857", "title": "MoE Network Placement and Parallelism Strategy", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you place TP, PP, DP, and EP for the 1.2T MoE model to keep token all-to-all off the slow InfiniBand fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3858", "title": "Architecting RDMA Transport for H100 Distributed Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect the RDMA transport, memory registration, and buffering for the 128-node ring all-reduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3859", "title": "Real-time LLM Safety Guardrail Architecture", 
"topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you co-locate the three 1B safety classifiers on H100s or deploy them on A100 microservices to meet a 40ms P99 latency budget?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3860", "title": "Multimodal Sharded Storage Architecture", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect the storage format and data loading to avoid I/O bottlenecks and minimize object storage costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3861", "title": "Rack-Level Thermal Architecture for Dense H100 Clusters", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule and power-cap the 5-node H100 racks to maximize throughput without exceeding the 30 kW cooling limit?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 3}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3862", "title": "Pod Placement Bandwidth Bottleneck", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What Kubernetes scheduling behavior explains a 4 GB activation transfer taking 80 ms between the two 4-GPU Pods?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3863", "title": "Synthetic Data PCIe Bottleneck in 3D Imaging", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the H100 reach only about 22% MFU when synthetic 3D batches are generated with zero CPU latency?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3864", "title": "Diagnosing Low Host-to-Device Bandwidth", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 4 GB host-to-device batch transfer take 250 ms over PCIe Gen5 instead of the theoretical minimum, and how can it be fixed?", "chain_ids": 
["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3865", "title": "Diagnosing Low Decoder MFU on A100", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the decoder phase fall to roughly 0.6% MFU at batch size 1 while the encoder achieves 45% MFU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3866", "title": "Diagnosing High Inference Latency on A100", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is average end-to-end latency 200 ms when the execution time is 40 ms and arrival rate is 20 requests per second?", "chain_ids": ["cloud-chain-auto-024-14"], "chain_positions": {"cloud-chain-auto-024-14": 0}, "chain_tiers": {"cloud-chain-auto-024-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3867", "title": "Toxicity Classifier Bandwidth Saturation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling a 5 TFLOPS guardrail classifier with 0.5 FLOPs/Byte intensity collapse LLM throughput on the GPU?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3868", "title": "K8s Network Architecture for PyTorch DDP", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Kubernetes networking option should you choose for DDP, Multus/SR-IOV RDMA or Calico overlay, and why?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 1}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3869", "title": "CPU Offload vs Gradient Accumulation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture provides higher training throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3870", "title": "FP16 Dual A100 vs FP8 Single H100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": 
"published", "phase": "training", "question": "Which architecture yields higher generation throughput, and what precision-accuracy tradeoffs must be managed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3871", "title": "Evaluating Upgrades for Memory-Bound Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which optimization meets the 50ms SLA: upgrading to an H100 or applying INT8 weight-only quantization on the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3872", "title": "Scaling Up vs Out Queue Dynamics", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Assuming Poisson arrivals and exponentially distributed service times (modeling as independent M/M/1 queues per GPU), which option should you choose and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3873", "title": "Evaluating RDMA Kernel Bypass for Distributed Clusters", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which transport design meets the 600 ms budget for sending a 16 GB tensor, TCP/IPoIB staging or GPUDirect RDMA?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 1}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3874", "title": "Disaggregated Prefill vs Hedged Requests for TTFT", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture is better for guaranteeing the SLA for a prompt that produces a 2 GB KV cache?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 1}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3875", "title": "LLM Training FLOPs and A100 Time Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many FLOPs are needed to pre-train the 1B model on 52B tokens, and what is the theoretical A100 training time?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3876", "title": "Rack Power Budgeting for H100 vs A100 Servers", "topic": 
"datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the H100 and A100 TDPs, and how many 8-GPU servers of each fit in a 15 kW rack with 2 kW host power?", "chain_ids": ["cloud-chain-auto-021-02"], "chain_positions": {"cloud-chain-auto-021-02": 0}, "chain_tiers": {"cloud-chain-auto-021-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3877", "title": "PCIe Gen5 Transfer Time and Pinned Memory", "topic": "dma-data-movement", "competency_area": "memory", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long should a 16 GB host-to-GPU transfer take over PCIe Gen5 x16, and why does pinned memory matter?", "chain_ids": ["cloud-chain-auto-secondary-017-17"], "chain_positions": {"cloud-chain-auto-secondary-017-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3878", "title": "VRAM Calculation for FP16 Guardrail Model", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM do the FP16 weights of the 7B guardrail require, and what percentage of an 80 GB A100 is that?", "chain_ids": ["cloud-chain-auto-secondary-015-33"], "chain_positions": {"cloud-chain-auto-secondary-015-33": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3879", "title": "Rack Cooling Limits for 8-GPU H100 vs A100 Nodes", "topic": "thermal-management", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the thermal output of 8x H100 versus 8x A100 GPUs, and how many full 8x H100 servers can a 15 kW rack cool?", "chain_ids": ["cloud-chain-auto-secondary-015-42"], "chain_positions": {"cloud-chain-auto-secondary-015-42": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3880", "title": "Calculate Model State Memory per GPU", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much per-GPU model-state memory is required for the 64B model with TP=8, PP=4, DP=16, and no ZeRO?", "chain_ids": ["cloud-chain-auto-025-08"], "chain_positions": {"cloud-chain-auto-025-08": 1}, "chain_tiers": {"cloud-chain-auto-025-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3881", "title": "KV Cache Footprint: Decoder vs Encoder-Decoder", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": 
"implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact KV-cache memory footprint after a 1024-token prompt (batch 32) for the decoder-only and encoder-decoder models?", "chain_ids": ["cloud-chain-auto-secondary-016-22"], "chain_positions": {"cloud-chain-auto-secondary-016-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-016-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3882", "title": "Compute Equal Opportunity Difference and Memory Read Latency", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Equal Opportunity Difference for Groups A and B, and what is the minimum HBM read time for the 10 GB dataset?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 1}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3883", "title": "SRAM Calculation for FlashAttention Tiling", "topic": "flash-attention", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given a head dimension d=128, a query block size B_r=64, and key/value block sizes B_c=128, how much SRAM in bytes is needed for the FP16 Q, K, V, and O tiles, and does it fit within the 192 KB limit?", "chain_ids": ["cloud-chain-auto-014-01"], "chain_positions": {"cloud-chain-auto-014-01": 0}, "chain_tiers": {"cloud-chain-auto-014-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3884", "title": "Calculate Logit Memory in LLM Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many GiB are required to store both teacher and student FP32 logits for the given batch, sequence length, and vocabulary size?", "chain_ids": ["cloud-chain-auto-secondary-015-29"], "chain_positions": {"cloud-chain-auto-secondary-015-29": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3885", "title": "Calculate KV-Cache Memory for 7B Model Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much KV-cache memory in GB is required for a batch size of 16 at 1024 tokens for a 7B FP16 Transformer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3886", "title": "Calculate Mixed-Precision Memory Footprint for Adam Training", "topic": "mixed-precision-training", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": 
"training", "question": "How much memory is required for the 7B model's FP16 weights, gradients, FP32 master weights, and Adam states, and will it fit on an 80 GB A100?", "chain_ids": ["cloud-chain-auto-secondary-015-01"], "chain_positions": {"cloud-chain-auto-secondary-015-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3887", "title": "Calculate Expert Token Capacity in Top-1 MoE Routing", "topic": "mixture-of-experts", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expert capacity per expert for batch 128, sequence length 2048, 8 experts, and capacity factor 1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3888", "title": "Calculate Tensor Parallel Communication Overhead", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical communication time in milliseconds for the MLP forward All-Reduce across the 2 GPUs?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 0}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3889", "title": "Calculate Pipeline Bubble Overhead for 1F1B Schedule", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the total 1F1B pipeline step time and exact bubble overhead percentage for p=8 and m=32?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3890", "title": "Parquet Storage and Pipeline Bottleneck Calculation", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much Parquet storage (in GB, base-10) is required, and how long will it take to load and decompress the full 50B-row dataset?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], "chain_positions": {"cloud-chain-auto-secondary-007-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3891", "title": "Coreset Selection Pipeline for LLM Pre-training at Scale", "topic": "data-efficiency-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the 15T-to-3T coreset scoring pipeline, identify its bottlenecks, and estimate scoring plus training time?", "chain_ids": ["cloud-chain-auto-secondary-015-41"], "chain_positions": {"cloud-chain-auto-secondary-015-41": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-41": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3892", "title": "H100 Cluster Power and Cooling Architecture", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which cooling architecture maximizes compute under the 25 MW facility cap, and how many nodes and racks can air vs D2C liquid support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3893", "title": "VLM-Based Automated Curation at 100B Scale", "topic": "dataset-curation", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Assuming 40% A100 MFU, what VLM scoring architecture, bottleneck, and GPU count finish 100B samples in 14 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3894", "title": "CUDA-to-HIP Porting Effort for Recommendation Model Kernels", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How would you use hipify-perl to port the 15K CUDA lines, and what manual work and timeline should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3895", "title": "ONNX Runtime Execution Provider Selection for Multi-Accelerator Inference", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs are required for a ResNet-50 batch of 32, and why is the 2.1 TFLOP estimate wrong?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3896", "title": "Triton Kernel Portability from NVIDIA to AMD GPUs", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At 55% of MI300X FP16 peak, what effective throughput is the Triton attention kernel achieving?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3897", "title": "Vendor Lock-in TCO Analysis: CUDA Moat vs Multi-Backend Investment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you invest $400K in a multi-vendor port or accept CUDA lock-in, and what is the 
break-even period assuming a 50/50 workload split?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3898", "title": "SYCL Portability Layer Performance on Intel Max Series vs H100", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If SYCL loses 69 TFLOPS relative to a 910 TFLOPS native baseline, what is the performance tax percentage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3899", "title": "ONNX Graph Partitioning Across Mixed Execution Providers", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you reduce the 40% ONNX Runtime latency hit from 50 CUDA EP fallback ops among 340 operators?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3900", "title": "Warp Size Divergence When Porting CUDA Kernels to AMD CDNA", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What warp-size assumption makes the HIP reduction slow on 64-wide wavefronts, and how should the reduction be fixed?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3901", "title": "TPU v5e XLA Compilation Constraints for Portable Model Code", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you refactor the PyTorch model so torch.compile can run it on both GPU Inductor and TPU XLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3902", "title": "Designing a Hardware-Agnostic Kernel Dispatch Layer", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What kernel dispatch architecture would let one inference framework run on CUDA, ROCm, and XLA with under 10% overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3904", "title": "ONNX Model Compatibility Matrix Across Runtime Versions", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": 
"How should you handle an ONNX opset 18 GroupNormalization model when production ONNX Runtime 1.14 only supports opset 17?", "chain_ids": ["cloud-chain-auto-secondary-005-02"], "chain_positions": {"cloud-chain-auto-secondary-005-02": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3905", "title": "Triton's IR Compilation Pipeline for Multi-Backend Targeting", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does Triton's compilation pipeline enable portability across NVIDIA PTX and AMDGPU, and where are backend-specific choices made?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3906", "title": "ROCm Library Parity Gap for Production ML Workloads", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which components have ROCm equivalents at performance parity, and which do not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3907", "title": "Cross-Platform Model Numerics Divergence Debugging", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you determine whether the 0.003 FP16 logit difference and 0.2% quality drop are a bug or numerical noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3908", "title": "Portable Quantization Formats Across Accelerator Backends", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How is W4A16 executed on modern GPUs, and why don't you need native 4-bit matrix multiplication hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3909", "title": "CI/CD Pipeline Design for Multi-Accelerator Kernel Testing", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design CI/CD to test 80 custom kernels across H100, MI300X, and TPU v5e without dedicated hardware in CI?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3910", "title": "HIP Kernel Launch Parameter Translation from CUDA", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you translate the CUDA launch to HIP on MI300X, and what occupancy issue does the 48 KB LDS allocation create?", "chain_ids": ["cloud-chain-auto-secondary-005-01"], "chain_positions": {"cloud-chain-auto-secondary-005-01": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3911", "title": "MLIR as a Universal Compiler IR for ML Portability", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many engineers are required to support the 3 new hardware targets using MLIR, assuming 2 engineers maintain shared MLIR infrastructure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3912", "title": "TPU v5e SPMD Programming Model vs CUDA's SIMT", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the key conceptual shifts when porting a CUDA SIMT sparse attention kernel to a TPU v5e SPMD model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3913", "title": "Evaluating torch.compile Backend Portability Across Accelerators", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the portability constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3914", "title": "Multi-Backend Distributed Training: Network Bandwidth Trap", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you architect multi-vendor distributed training, and why does a 14 GB transfer take 124 ms instead of 15 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3915", "title": "ONNX Operator Coverage Gap Analysis for Transformer Models", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you fix the three GPT-2 ONNX runtime failures on DirectML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3916", "title": "Memory Layout Portability: NCHW vs NHWC Across Accelerators", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do NCHW layouts cause a 20-30% drop on TPU v5e and MI300X, and how do you fix it?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3917", "title": "Benchmarking Methodology for Cross-Platform Performance Claims", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What methodology do you apply to evaluate the vendor's cross-platform performance claims?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3918", "title": "Migrating cuDNN-Fused Operators to Portable Alternatives", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you recover the performance of a cuDNN-specific 3-way fusion on MI300X when MIOpen lacks an equivalent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3919", "title": "Portable Mixed-Precision Training Across GPU Architectures", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do you make mixed-precision training portable across hardware architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3920", "title": "Assessing CUDA Vendor Lock-In and ML Stack Portability Risk", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you assess CUDA vendor lock-in risk and build a portability roadmap across the ML stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3921", "title": "Driver Version Compatibility Hell in Multi-GPU Cloud Deployments", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the container crash with driver 525 and CUDA runtime 12.1, and how should you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3922", "title": "Portable Profiling: NCU vs rocProf vs XLA Profiler Cross-Platform Analysis", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you create comparable performance analysis across these disparate hardware platforms and profilers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3923", "title": "Compiler Fusion Differences Across XLA, TorchInductor, and MIGraphX", "topic": "software-portability", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "Why do TorchInductor, MIGraphX, and XLA produce different fusion granularities for the same 12-layer transformer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3924", "title": "Overlapping Backward Pass with AllReduce in Data-Parallel Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does DDP start AllReduce before the entire backward pass finishes?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 0}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3925", "title": "Gradient Bucket Size Tuning for Optimal Communication Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you tune DDP gradient bucket size to reduce the 40% communication overhead for the 1.3B model?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 1}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3926", "title": "NCCL Async Operations and CUDA Stream Orchestration for Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If 4GB of gradients are sent over a 50 GB/s link, what is the serial communication time before any overlap?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 2}, "chain_tiers": {"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3927", "title": "1F1B vs GPipe — What 1F1B Actually Saves", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 4 stages and 8 micro-batches, what is the bubble fraction under GPipe versus 1F1B, and what does 1F1B actually save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3928", "title": "Double Buffering for Data Loading Overlap with GPU Compute", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How would you use double buffering to hide the 20 ms CPU-to-GPU transfer behind the 15 ms GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3929", "title": "Prefetching Activations in Pipeline Parallelism Across Nodes", "topic": 
"communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design an activation prefetching scheme to overlap 256MB activation transfers over 400 Gbps InfiniBand with compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3930", "title": "Overlap Efficiency Degradation at Scale: 8 GPUs vs 256 GPUs", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does AllReduce overlap efficiency fall from 92% at 8 H100s to 68% at 256 H100s, and how would you fix it?", "chain_ids": ["cloud-chain-auto-022-03"], "chain_positions": {"cloud-chain-auto-022-03": 2}, "chain_tiers": {"cloud-chain-auto-022-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3931", "title": "Tensor Parallelism AllReduce Overlap with Feedforward Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can standard tensor parallelism AllReduces be overlapped with computation, and what techniques partially address this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3932", "title": "ZeRO Stage 3 Communication-Computation Overlap Strategy", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap ZeRO-3 parameter AllGathers with forward and backward computation across 64 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3933", "title": "CUDA Stream Priority for Communication vs Computation Scheduling", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you prioritize CUDA streams to prevent communication from blocking computation?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 1}, "chain_tiers": {"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3936", "title": "Overlapping AllReduce with Optimizer Step via Gradient Sharding", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can we overlap the optimizer step with ongoing AllReduce by processing shards as their gradients are reduced?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3937", 
"title": "FSDP Prefetch Policy Tuning for Communication-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which FSDP prefetch policy should you use when BACKWARD_PRE OOMs but NO_PREFETCH is 40% slower, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3938", "title": "Overlapping KV Cache Transfer with Decode Computation in Serving", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap transferring a 2GB KV cache over NVLink with decode on the destination GPU?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 0}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3939", "title": "MI300X Infinity Fabric vs H100 NVLink for Communication Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do H100 NVSwitch and MI300X Infinity Fabric topology differences affect overlap for 70B LLM training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3940", "title": "Diagnosing Failed Overlap: NCCL Blocking on Compute Stream", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are the NCCL AllReduce kernels failing to overlap with the backward computation, running sequentially instead?", "chain_ids": ["cloud-chain-auto-022-04"], "chain_positions": {"cloud-chain-auto-022-04": 0}, "chain_tiers": {"cloud-chain-auto-022-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3941", "title": "Expert Parallelism All-to-All Overlap in Mixture-of-Experts Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you overlap the 8 ms All-to-All routing with 6 ms expert compute across 32 MoE layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3942", "title": "PCIe DMA Engine Overlap for Host-Device Data Movement", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does H100's separate DMA copy engine let PCIe transfers overlap with kernel execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-3943", "title": "Quantifying Overlap Efficiency with Nsight Systems Timeline Analysis", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the overlap efficiency and exposed communication time for backward [180,380] ms and NCCL [200,350] ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3944", "title": "Micro-Batch Size Impact on Pipeline Parallelism Bubble Ratio", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you choose the micro-batch count for an 8-stage pipeline with global batch 1024, balancing bubble ratio and GPU efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3945", "title": "Overlapping Dense MLP AllReduce with Sparse Embedding AlltoAll", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can the dense MLP AllReduce and the sparse embedding AlltoAll overlap, and what is the latency impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3946", "title": "TPU v5e ICI Ring Topology and Communication Overlap Opportunities", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does a 2D torus ICI topology change communication-computation overlap strategy for data-parallel training versus NVSwitch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3947", "title": "Activation Recomputation vs Activation Stashing Trade-Off for Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does this choice affect communication overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3948", "title": "Ring AllReduce Latency Model and Bandwidth Saturation Analysis", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the time complexity of ring AllReduce for N GPUs and M bytes, and why does it help communication overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3949", "title": "Overlapping Gradient Accumulation Steps with Next Batch Loading", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": 
"implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you prefetch data during K=4 gradient accumulation so micro-steps do not stall waiting for the next batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3950", "title": "Async Checkpointing Overlap with Training Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you make 70B model checkpoints asynchronous so the 45-second save every 100 steps does not pause training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3951", "title": "Sequence Parallelism AllGather Overlap in Megatron-LM", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Megatron-LM overlap the sequence-parallel AllGather before attention/MLP blocks with computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3952", "title": "Compute-Communication Overlap in Distributed Inference with Speculative Decoding", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you overlap the draft model's K-token generation with the 4-GPU target model's previous verification?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 2}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3953", "title": "NCCL Graph Capture for Reducing Communication Launch Overhead", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What benefit does CUDA Graph capture provide for 200 small NCCL AllReduce calls, and what limitations must you handle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3954", "title": "Overlapping Gradient Norm Computation with AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you overlap the global gradient norm computation with the gradient AllReduce to eliminate its latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3955", "title": "Communication Overlap in Context-Parallel Long-Sequence Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": 
"create", "status": "published", "phase": "both", "question": "How would you overlap KV exchange with attention compute for 128K-context training using context parallelism across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3956", "title": "Multi-Level Overlap: Data Loading, Compute, and Communication Pipeline", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you pipeline data loading, forward/backward compute, and AllReduce so the 240 ms step approaches 200 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3957", "title": "Weight Update Sharding with Overlapped AllGather in FSDP2", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What overlap opportunities exist in FSDP2's ReduceScatter, local optimizer, and AllGather pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3958", "title": "Network Congestion Detection During Overlapped Training", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 64-GPU step time jump when AllReduce latency fluctuates from 30ms to 370ms, and how would you fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3959", "title": "Overlapping Parameter Server Pull with Forward Computation", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can this parameter server pull be overlapped with the forward pass of non-embedding layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3960", "title": "GradScaler Synchronization as a Hidden Overlap Barrier", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3961", "title": "Software Pipelining in Triton Kernels for Memory-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does Triton's num_stages software pipelining improve your H100 GEMM, and what value would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3963", "title": "Prefill-Decode 
Disaggregation Overlap in Production LLM Serving", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you overlap a 4GB KV-cache transfer over 100 Gbps Ethernet to minimize time-to-first-token in disaggregated serving?", "chain_ids": ["cloud-chain-auto-022-05"], "chain_positions": {"cloud-chain-auto-022-05": 1}, "chain_tiers": {"cloud-chain-auto-022-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3964", "title": "Backward-Triggered Gradient Accumulation with Deferred AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the overlap tradeoffs of deferring DDP AllReduce to only the last micro-step for K=4 gradient accumulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3965", "title": "H100 TMA (Tensor Memory Accelerator) for Async Memory Loads", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does H100 TMA improve compute-memory overlap compared with A100's cooperative shared-memory loads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3966", "title": "Stragglers and Overlap: Why the Slowest GPU Determines Step Time", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does one slow GPU destroy overall performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3967", "title": "Communication-Aware Model Architecture Design", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you choose the transformer dimensions and tensor-parallel layout to maximize communication-computation overlap on 1024 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3968", "title": "Overlapping Collective Permute with MoE Expert Computation on TPU", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can overlap still work between ICI transfers and expert computation on TPU v5e?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3969", "title": "Bucket Fusion Order and Its Effect on First-Bucket Overlap Latency", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", 
"level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you form DDP gradient buckets to improve AllReduce overlap for the 1MB output layer and 25MB intermediate layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3970", "title": "Cross-Mesh Communication Overlap in 3D Parallelism", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule TP, PP, and DP communication to maximize overlap without IB contention in this 512-GPU 3D-parallel step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3971", "title": "Overlap Measurement: MFU vs HFU and Communication Accounting", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the difference between MFU 45% and HFU 58%, and what do they imply about communication-computation overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3972", "title": "Overlap in Heterogeneous Pipeline: Different GPU Types per Stage", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can communication overlap mitigate the 3x A100/H100 pipeline-stage imbalance, and what should you do instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3973", "title": "Asynchronous Local SGD as an Alternative to Overlapped AllReduce", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do Local SGD at H=4 and H=16 trade communication savings against convergence penalties compared to overlapped AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-3985", "title": "Training Budget Overrun From Sequence Length", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does switching from 2,048 to 8,192 context affect a 400,000 GPU-hour training budget when attention is 30% of step time?", "chain_ids": ["cloud-chain-auto-secondary-015-07"], "chain_positions": {"cloud-chain-auto-secondary-015-07": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-07": "secondary"}, "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4009", "title": "ECMP Hash Polarization in Fat-Tree Topologies", "topic": 
"congestion-control", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the root cause and what distinguishes ECMP hash polarization from simple oversubscription?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 0}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4010", "title": "PFC Deadlock in Lossless Ethernet Fabrics", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does PFC deadlock form in this Clos network, and how would you prevent full-fabric stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4011", "title": "Incast Congestion During AllReduce Reduce-Scatter", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What incast pattern is causing the 50ms reduce-scatter spikes, and why is reduce-scatter more vulnerable than all-gather?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4012", "title": "Designing ECN Thresholds for ML Traffic", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What ECN thresholds, PFC thresholds, and DCQCN rate-reduction parameters would you set for this 400G RoCE cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4013", "title": "Switch Buffer Sizing for ML Incast", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum shared buffer is needed to absorb the 8-to-1 incast burst for one 4μs RTT without packet drops?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4014", "title": "DCQCN vs TIMELY Congestion Control", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which congestion-control protocol, DCQCN or TIMELY, better serves this mixed AllReduce and pipeline-parallel workload, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4015", "title": "Buffer Management with Dynamic Thresholds", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you redesign switch buffer management so 4 elephant flows do not starve 60 mice flows on the 64-port 400G switch?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4016", "title": "Congestion Spreading in Multi-Rail ML Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What rail-isolation requirements would prevent congestion on Rail 0 from degrading Rails 1-7 in the 4096-H100 8-rail cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4017", "title": "Flowlet Switching for AllReduce Load Balancing", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the inter-packet gap for 4KB packets at 400Gbps, and what flowlet timeout should you use for 256MB AllReduce messages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4018", "title": "Throughput Impact of PFC Storms", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the throughput impact of 2ms PFC storms once per minute per ToR in this 256-A100 synchronous AllReduce job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4019", "title": "Weighted ECMP for Heterogeneous Link Speeds", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does standard ECMP fail with one spine link degraded to 200G, and what routing changes are needed?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 1}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4020", "title": "Congestion Control for Pipeline-Parallel Microbatches", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design congestion control for 8MB activation transfers every 2ms to minimize 16-stage pipeline bubble time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4021", "title": "ECN Marking Accuracy at 400Gbps Line Rate", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How many ECN marking decisions per second are required at 400Gbps with 4KB packets, and why must the algorithm be simple?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4022", "title": "Victim Flow Starvation Under PFC", "topic": "congestion-control", 
"competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does checkpoint traffic using only 15% of fabric bandwidth cause a 70% AllReduce throughput drop with PFC enabled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4023", "title": "Congestion-Aware Adaptive Routing on InfiniBand", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should Adaptive Routing be enabled for the mix of 90% AllReduce and 10% parameter-server traffic, and how should it be configured?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 3}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4024", "title": "Network Calculus for ML Traffic Guarantees", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Using network calculus concepts, what worst-case end-to-end delay bound do you derive for an 8MB activation tensor traversing 6 Clos hops with 2MB buffers?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4025", "title": "NCCL Network Plugin Congestion Tuning", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How did lowering NCCL_IB_TIMEOUT to 14 and NCCL_IB_RETRY_CNT to 3 cause retransmissions, and what values should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4026", "title": "Slow Receiver Problem in RoCE ML Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does one MI300X NIC limited to 200Gbps affect a 64-GPU ring AllReduce, and why is this worse than in a tree?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4027", "title": "Micro-Burst Detection for ML Traffic", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What 400Gbps micro-burst can hide inside 5-second switch-counter samples, and how would you detect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4028", "title": "Congestion Control at 800G and Beyond", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can PFC, DCQCN, and ECN scale to 800Gbps links with 4KB packets, or 
is a fundamental redesign needed?", "chain_ids": ["cloud-chain-auto-020-12"], "chain_positions": {"cloud-chain-auto-020-12": 3}, "chain_tiers": {"cloud-chain-auto-020-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4029", "title": "Phantom Congestion from NCCL Tree AllReduce", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does tree AllReduce generate 40% more ECN-marked packets than ring AllReduce despite the same total data volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4030", "title": "Cross-Job Congestion Isolation in Multi-Tenant Clusters", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What network isolation architecture would you specify to prevent one job's AllReduce congestion from degrading another's performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4031", "title": "TPU v5e Congestion in ICI Fabric", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does congestion management for AllReduce differ on TPU v5e's ICI torus versus a switched Clos GPU fabric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4032", "title": "End-to-End Congestion Budget for Training Iteration", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate the <5% communication-overhead SLA budget across ECMP imbalance, ECN/DCQCN, PFC, retransmissions, and software?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4033", "title": "ECMP vs Adaptive Routing Tradeoff Space", "topic": "congestion-control", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When is static ECMP preferable to Dynamic Load Balancing on Tomahawk-5 for MI300X training traffic, considering overhead?", "chain_ids": ["cloud-chain-auto-020-11"], "chain_positions": {"cloud-chain-auto-020-11": 2}, "chain_tiers": {"cloud-chain-auto-020-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4034", "title": "Ring AllReduce Bandwidth Formula", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the ring AllReduce time formula for N GPUs, bandwidth B, and gradient size D, and why is it near
bandwidth-optimal?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 0}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4035", "title": "Ring vs Tree AllReduce Latency Tradeoff", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For 1024 MI300X GPUs synchronizing a 50MB tensor, which NCCL AllReduce algorithm is faster and where is the crossover point?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 1}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4036", "title": "Gradient Compression with Top-K Sparsification", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you use Top-K gradient sparsification at 1% density for 4GB gradients, and what happens to the dropped 99%?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 1}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4037", "title": "Bucket Fusion in NCCL AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With NCCL_BUFFSIZE=4MB, how many fused AllReduce operations does ResNet-152 need, and what speedup comes from reduced launch overhead?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 0}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4038", "title": "Diagnosing Gradient Staleness in Async SGD", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does average staleness of 12 iterations hurt async SGD convergence for a 13B LLM, and what staleness is tolerable?", "chain_ids": ["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 0}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4039", "title": "Hierarchical AllReduce Design for Multi-Rack Clusters", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hierarchical AllReduce stages and algorithms would you use for the 512-GPU, 64-node, 8-rack cluster?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4040", "title": "MoE Token Routing All-to-All vs AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the MoE All-to-All token routing pattern across 128 MI300X GPUs, and how does its bandwidth compare to AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4041", "title": "Gradient Quantization to INT8", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What communication time savings and quantization error result from INT8-quantizing the 2 GiB FP16 gradients before AllReduce?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 0}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4042", "title": "AllReduce Communication Hiding with Computation Overlap", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With a 200ms backward pass and 60ms AllReduce on 128 H100s, what maximum overlap and iteration time can you achieve?", "chain_ids": ["cloud-chain-auto-023-13"], "chain_positions": {"cloud-chain-auto-023-13": 1}, "chain_tiers": {"cloud-chain-auto-023-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4043", "title": "Local SGD vs AllReduce at Scale", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Evaluate wall-clock time, convergence, and network utilization to determine whether AllReduce or Local SGD is superior for this run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4044", "title": "Recursive Halving-Doubling AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How does recursive halving-doubling latency compare to ring AllReduce, and why is it preferred for small messages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4045", "title": "Gradient Accumulation as Communication Reduction", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 4x micro-batch accumulation on 64 GPUs, what are the effective batch size, communication overhead reduction, and memory cost?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4046", "title": "PowerSGD Low-Rank Gradient Compression", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "For PowerSGD rank 4 on a 4096×4096 gradient, what are the compression ratio and AllReduce savings on 256 GPUs?", "chain_ids": ["cloud-chain-auto-023-12"], "chain_positions": {"cloud-chain-auto-023-12": 2}, "chain_tiers": {"cloud-chain-auto-023-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4047", "title": "Gradient Synchronization for ZeRO Stage 3", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What is the ZeRO-3 per-layer communication schedule and cost for a 64-GPU, 30B-parameter, 60-layer training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4048", "title": "Detecting Silent Gradient Corruption", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you detect which GPU or network path is producing corrupted gradients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4049", "title": "AllReduce Bandwidth Efficiency on NVLink vs RoCE", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 64KB gradient, what are the intra-node NVLink and inter-node RoCE AllReduce times, and where is the latency crossover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4050", "title": "SHARP In-Network AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For 512 H100s, what benefits and limitations does SHARP have versus host-based NCCL AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4051", "title": "Mixed-Precision Gradient Synchronization Pipeline", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you redesign the BF16-gradient Adam pipeline to minimize AllReduce communication while preserving FP32 optimizer precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4052", "title": "Gradient Synchronization for FSDP with Activation Checkpointing", "topic": 
"gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "With activation checkpointing, what total FSDP communication volume and scheduling constraints apply for the 65B LLM on 256 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4053", "title": "Gradient Synchronization Under Stragglers", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How many GPU-hours are wasted over 24 hours, and how would you mitigate this without hurting gradient quality?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4054", "title": "AllReduce Algorithm Selection in NCCL", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which AllReduce algorithm should you choose for 64KB, 256MB, and 4GB gradients on 2048 MI300X GPUs, and why under LogP?", "chain_ids": ["cloud-chain-auto-023-11"], "chain_positions": {"cloud-chain-auto-023-11": 3}, "chain_tiers": {"cloud-chain-auto-023-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4055", "title": "TPU v5e AllReduce over ICI Torus", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How does 1GB AllReduce time on the torus compare with 256 H100s on 400Gbps Ethernet, and why does topology matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4056", "title": "Gradient Clipping Interaction with AllReduce", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What overhead does the scalar norm AllReduce add for global gradient clipping on 256 MI300X GPUs, and can it be overlapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4057", "title": "DiLoCo: Distributed Low-Communication Training", "topic": "gradient-synchronization", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does DiLoCo with H=500 compare with synchronous AllReduce for a 70B model on 256 GPUs in communication, time, and convergence?", "chain_ids": ["cloud-chain-auto-023-14"], "chain_positions": {"cloud-chain-auto-023-14": 2}, "chain_tiers": {"cloud-chain-auto-023-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4058", "title": "AllReduce on Non-Power-of-Two GPU Counts", "topic": "gradient-synchronization", 
"competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What AllReduce strategy should you use for 497 GPUs, and why might ring still beat a non-power-of-two recursive scheme?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4059", "title": "InfiniBand Architecture Fundamentals", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are HCAs, subnet managers, queue pairs, and virtual lanes in InfiniBand, and how do they enable RDMA on a 256-H100 cluster?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 0}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4060", "title": "RoCE v2 vs InfiniBand for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 1024-MI300X cluster, how do InfiniBand NDR and RoCE v2 compare in congestion control, multi-tenancy, operations, and recovery?", "chain_ids": ["cloud-chain-auto-020-01"], "chain_positions": {"cloud-chain-auto-020-01": 1}, "chain_tiers": {"cloud-chain-auto-020-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4061", "title": "Zero-Copy RDMA for Gradient Transfer", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What zero-copy GPUDirect RDMA data path would you use for gradient AllReduce between two GPUs in different nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4062", "title": "Kernel Bypass and Verbs API Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 65,536 TCP packets carrying a 256MB transfer, how much context-switch and kernel overhead does RDMA bypass save?", "chain_ids": ["cloud-chain-auto-020-02"], "chain_positions": {"cloud-chain-auto-020-02": 0}, "chain_tiers": {"cloud-chain-auto-020-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4063", "title": "GPUDirect RDMA Memory Registration Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is NCCL memory registration causing the 500ms startup delay, and what are the correct MTT and QP setup costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4064", "title": "RDMA Write vs RDMA 
Send for AllReduce", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does NCCL prefer RDMA Write over Send/Receive for a 256MB gradient transfer, and how much overhead does it avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4065", "title": "PCIe Bandwidth Bottleneck for GPUDirect RDMA", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the GPUDirect RDMA bottleneck on H100 SXM, and how much headroom remains between PCIe Gen5 x16 and a 400Gbps NIC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4066", "title": "Designing RDMA Buffer Pool for AllReduce Pipelining", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you allocate the RDMA buffer pool for a 2GB, 8-stage pipelined ring AllReduce on 256 H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4067", "title": "RDMA Completion Queue Polling vs Interrupts", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 1000 AllReduce iterations per second on 64 GPUs, what CPU-cycle overhead does CQ polling use versus interrupts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4068", "title": "RoCE v2 Packet Format and GRH Overhead", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a 4KB RDMA Write payload, what are the RoCE v2 header overhead and payload efficiency compared with native InfiniBand?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4069", "title": "GPUDirect RDMA Page Table Alignment", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 2KB alignment hurt MI300X GPUDirect RDMA throughput, and how much loss comes from PCIe TLP splitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4070", "title": "RDMA Reliable Connection vs Unreliable Datagram", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For AllReduce on 512 GPUs, how do InfiniBand RC and UD transports trade off QP scalability, failures, and throughput?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4071", "title": "RDMA Memory Registration with On-Demand Paging", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What GPUDirect RDMA on-demand paging architecture would let PyTorch allocate GPU memory dynamically without pre-registration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4072", "title": "GPUDirect RDMA vs GPUDirect Storage Data Paths", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do concurrent GPUDirect Storage checkpoint saves affect AllReduce throughput on a 64-MI300X cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4073", "title": "RDMA Multi-Path with Bonded NICs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use all 8 ConnectX-7 rails for a 2GB AllReduce while handling rail failures and preserving reduction ordering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4074", "title": "RDMA Latency Breakdown for a Single Write", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency breakdown for a 64-byte GPUDirect RDMA Write between two H100 nodes through one IB switch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4075", "title": "NIC-Level Traffic Shaping for RDMA Fairness", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design NIC-level traffic shaping so Job A's AllReduce does not starve Job B's latency-sensitive activation transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4076", "title": "iWARP vs RoCE v2 for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How suitable is iWARP versus RoCE v2 for large-scale ML training, considering latency, throughput, CPU overhead, and operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4077", "title": "RDMA Atomic Operations for Distributed Locking", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Under 256-way contention 
on a single RDMA CAS lock, what is the maximum lock throughput and why do atomics bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4078", "title": "Ultra Ethernet Consortium for ML Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would UEC's packet spraying, selective retransmission, and multipath transport compare with RoCE v2 and InfiniBand at 8192 MI300X GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4079", "title": "RDMA Error Recovery in Long-Running Training Jobs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What tiered RDMA error recovery strategy lets a 90-day, 1024-H100 training run continue through QP, path, and node failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4080", "title": "GDRCopy for Small RDMA Messages", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 64-byte NCCL control messages, how do GDRCopy and GPUDirect RDMA latencies compare, and why use GDRCopy anyway?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4081", "title": "RDMA Connection Scaling for 16,384 GPUs", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the QP state memory and setup-time costs for naive RDMA at 16,384 GPUs, and what sub-quadratic design would you use?", "chain_ids": ["cloud-chain-auto-020-03"], "chain_positions": {"cloud-chain-auto-020-03": 1}, "chain_tiers": {"cloud-chain-auto-020-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4082", "title": "RDMA Performance Isolation with SR-IOV", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the SR-IOV RDMA overhead versus bare metal on one 400G ConnectX-7 NIC, and how many tenants can share it above 90% AllReduce throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4083", "title": "RDMA with CXL Memory Expansion for ML", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is CXL-attached memory a good RDMA gradient-buffer design for a 512-GPU cluster, and why or why not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4084", "title": "Column-Parallel Linear Layer Communication Pattern", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "After a column-parallel linear layer on 8xMI300X, what collective, if any, is needed before the next layer can consume the partial outputs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4085", "title": "AllReduce Volume for an MI300X Transformer Block", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For hidden=8192, TP=8, and batch=2048 on MI300X, what are the per-AllReduce and summed per-block volumes and times?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 1}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4086", "title": "Why TP Stays Within a Node", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is this almost certainly a bad idea, and what would you recommend instead?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 0}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4087", "title": "TP Layout Under an Asymmetric Bandwidth Budget", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "On 4 GPUs with a 2x NVLink pair + 2x PCIe pair, should the [8192, 8192] linear block use column-parallel + AllGather or row-parallel + AllReduce, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4088", "title": "Sequence Parallelism in Megatron-LM", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For seq_len=8192, batch=4, hidden=8192, TP=8, how much activation memory is wasted, and how does sequence parallelism fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4089", "title": "TP Degree Selection for Inference", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use TP=1 with a single GPU or TP=2 across two GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-4090", "title": "AllReduce vs AllGather Cost in TP", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For M bytes across P GPUs, what are the ring AllReduce vs AllGather communication volumes, and which operations do Megatron's row and column parallel layers use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4091", "title": "TP and Attention Head Partitioning", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With 32 attention heads on 8-GPU nodes, can you use TP=8 or TP=6, and what head-divisibility constraint governs TP degree?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4092", "title": "TP Memory Savings Calculation", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With a 30B BF16 model and TP=4 on A100-80GB GPUs, how much weight and AdamW optimizer-state memory does each GPU hold?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4093", "title": "TP Bubble: Synchronization Overhead in Practice", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What two concrete strategies would you use to reduce the 15% TP AllReduce overhead without changing TP=8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4094", "title": "GQA Impact on Tensor Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How are the KV heads distributed, and what problem arises if you try TP=16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4095", "title": "TP vs Expert Parallelism for MoE Models", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which parallelism strategy is superior for this MoE model and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4096", "title": "Embedding Table Parallelism Strategy", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "For a 3.15 GB embedding table with TP=8, what are two valid sharding strategies and their 
tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4097", "title": "Megatron Fused Block Backward TP Communication", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "In Megatron's fused column->row block, what TP communications occur in backward, and what are the per-block and full-step volumes?", "chain_ids": ["cloud-chain-auto-022-07"], "chain_positions": {"cloud-chain-auto-022-07": 2}, "chain_tiers": {"cloud-chain-auto-022-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4098", "title": "TP with Activation Checkpointing Interaction", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "During full activation checkpointing with TP=8, which forward TP collectives are recomputed in backward, and does TP communication increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4099", "title": "MLP Timing Imbalance After Correct TP Sharding", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "If TP=8 partitions the 48-head model correctly, what can cause slight MLP execution-time imbalance across MI300X ranks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4100", "title": "TP + DP Hybrid Parallelism Layout", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should TP and DP process groups be laid out, and where does each type of communication occur?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4101", "title": "TP Scaling Efficiency Cliff", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 13B model on H100 NVLink, how does MFU change from TP=1 to TP=8, and at what TP degree does efficiency drop sharply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4102", "title": "TP for Cross-Attention in Encoder-Decoder Models", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "With TP=4 and partitioned encoder outputs, how should cross-attention KV projections be handled without adding extra communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "cloud-4103", "title": "Debugging TP Numerical Divergence", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "After switching from TP=1 to TP=4, what are the most likely causes of loss divergence after 500 steps, and how would you systematically debug it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4104", "title": "Context Parallelism vs Sequence Parallelism", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the difference between sequence parallelism and context parallelism, and when would you use each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4105", "title": "TP Communication Overlap with NVLink", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What CUDA stream setup is needed to overlap TP AllReduce and GEMM on the 8x A100 DGX node?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4106", "title": "TP Degree for Inference Latency SLA", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the minimum TP degree needed to meet this SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4107", "title": "TP Weight Sharding Memory Layout", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "For TP=8 inference, how are Megatron-style TP-sharded checkpoints stored and what are the two ways to load each GPU's shard?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4108", "title": "405B Training Parallelism Layout", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What TP, PP, and DP layout would you design for 405B training on 128 GPUs, and what throughput should you expect?", "chain_ids": ["cloud-chain-auto-022-06"], "chain_positions": {"cloud-chain-auto-022-06": 3}, "chain_tiers": {"cloud-chain-auto-022-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4109", "title": "Backfill Scheduling for Small GPU Jobs", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling 
policy would reduce the 4-hour FIFO wait for small inference jobs, and what tradeoff does it introduce?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 0}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4110", "title": "Bin-Packing GPUs for Mixed Workloads", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does first-fit cause topology fragmentation for 1-, 2-, 4-, and 8-GPU jobs, and how does bin-packing improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4111", "title": "MIG Partitioning for Inference Multiplexing", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you use MIG to partition 4 A100-80GB GPUs to run 12 concurrent <7B model instances with isolation?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 2}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4112", "title": "Gang Scheduling for Distributed Training", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling concept prevents this, and why is it critical for distributed training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4113", "title": "Fair-Share Scheduling Across Teams", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should fair-share scheduling handle Team B borrowing Team A's idle GPUs, and what happens when Team A submits a large job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4114", "title": "GPU Preemption for Priority Inference", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you preempt training jobs to free 8 MI300X GPUs for urgent inference while minimizing wasted compute and meeting the SLA?", "chain_ids": ["cloud-chain-auto-021-06"], "chain_positions": {"cloud-chain-auto-021-06": 2}, "chain_tiers": {"cloud-chain-auto-021-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4115", "title": "GPU Time-Slicing vs MIG vs MPS", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": 
"cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do CUDA time-slicing, MIG, and MPS differ in isolation, overhead, and best use case?", "chain_ids": ["cloud-chain-auto-021-04"], "chain_positions": {"cloud-chain-auto-021-04": 1}, "chain_tiers": {"cloud-chain-auto-021-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4116", "title": "Spot Instance Strategy for Training Resilience", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What checkpointing strategy maximizes cost savings while minimizing wasted compute for a 13B model on 32 spot instances?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4117", "title": "Multi-Tenant GPU Cluster Quota Design", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design quotas, scheduling policies, and safeguards for a 500-GPU cluster shared by 8 ML teams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4118", "title": "Topology-Aware Scheduling on DGX H100", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does NVLink topology matter for placement on this node, or are all 4-GPU subsets equivalent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4119", "title": "GPU Memory Oversubscription Dangers", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If two 30 GB and 35 GB models share one 80 GB GPU and see concurrent peak traffic, what happens and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4120", "title": "Scheduler Interaction with NCCL Timeout", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can stale CUDA processes on 4 allocated GPUs cause a 64-GPU NCCL timeout, and what scheduler checks prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4121", "title": "Elastic Training with Dynamic GPU Scaling", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you safely shrink the 64-GPU training job to 48 GPUs without 
restarting from scratch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4122", "title": "Diagnosing Low GPU SM Utilization and Right-Sizing", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are three common causes of <20% GPU utilization, and what right-sizing action would you take for each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4123", "title": "Scheduling Deadline-Aware Training Jobs", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can 128 GPUs train the 70B model on 1T tokens within 14 days, and how would you schedule around the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4124", "title": "Thermal-Limited GPU Power Capping", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you adjust GPU scheduling to stay within the 250 kW thermal rejection limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4125", "title": "Fragmentation-Aware Scheduling Policy", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What scheduling policy would prevent the 16-GPU job from being blocked by fragmented free GPUs?", "chain_ids": ["cloud-chain-auto-021-07"], "chain_positions": {"cloud-chain-auto-021-07": 2}, "chain_tiers": {"cloud-chain-auto-021-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4126", "title": "Multi-Cluster GPU Federation Scheduling", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a federated scheduler to route jobs across the clusters based on job characteristics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4127", "title": "Job Priority Inversion in GPU Clusters", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you resolve this without wasting the training compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4128", "title": "GPU Health Monitoring for Scheduler Integration", "topic": 
"scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which five GPU health metrics should the scheduler monitor, and what thresholds should trigger draining a GPU from the pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4129", "title": "MIG vs Full GPU for Inference Cost Optimization", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does the cost per token compare between a full GPU and 3g.40gb MIG partitions for the 7B service?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4130", "title": "Scheduling Multi-Node Training with Network Constraints", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the scheduler consider network topology to optimize the 128-GPU job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4131", "title": "GPU Lease Duration and Scheduling Efficiency", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What wall-time policy would improve efficiency when 7-day MI300X jobs only use 15% of their reserved runtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4132", "title": "Scheduling for Heterogeneous GPU Clusters", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a heterogeneous scheduler to maximize throughput-per-dollar across H100, MI300X, and A100 GPUs for a mixed workload of LLM training, inference, and fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4133", "title": "Slurm H100 GRES and MIG Scheduling Configuration", "topic": "scheduling-resource-management", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What key slurm.conf and gres.conf entries enable GPU-aware, topology-aware scheduling for the 10-node H100 cluster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4134", "title": "Kubernetes GPU Device Plugin Architecture", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "When a pod requests 
nvidia.com/gpu: 2, what is the sequence of events from pod scheduling to GPU availability inside the container?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4135", "title": "Kubernetes GPU Scheduling with Node Affinity and Tolerations", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What pod spec would ensure the FP8 training job is placed only on appropriate nodes with the right tolerations and GPU requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4136", "title": "Gang Scheduling in Kubernetes with Volcano", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does Volcano scheduler solve this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4137", "title": "MIG Partitioning in Kubernetes", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you configure MIG on the node and have pods request the specific 3g.40gb partitions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4138", "title": "Kubernetes GPU Scheduling with Limits Only", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why this asymmetry, and what does it mean for GPU scheduling?", "chain_ids": ["cloud-chain-auto-021-11"], "chain_positions": {"cloud-chain-auto-021-11": 1}, "chain_tiers": {"cloud-chain-auto-021-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4139", "title": "Multi-Node Training Job Orchestration in K8s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which orchestration approach is most appropriate for the multi-node PyTorch job and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4140", "title": "GPU Node Autoscaling in Kubernetes", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design autoscaling to handle traffic spikes without 5-minute cold starts?", "chain_ids": ["cloud-chain-auto-021-13"], "chain_positions": {"cloud-chain-auto-021-13": 1}, "chain_tiers": {"cloud-chain-auto-021-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4141", "title": "Container Image Optimization for GPU ML Workloads", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you reduce the pod startup time to under 1 minute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4142", "title": "RDMA and Host Networking for ML Training Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the Kubernetes networking bottleneck limiting NCCL to 15 GB/s, and how would you configure K8s for full InfiniBand bandwidth?", "chain_ids": ["cloud-chain-auto-021-12"], "chain_positions": {"cloud-chain-auto-021-12": 2}, "chain_tiers": {"cloud-chain-auto-021-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4143", "title": "K8s Pod Failure Semantics for Distributed Training", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What happens next to the other 31 pods, and how should the PyTorchJob be configured to handle this correctly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4144", "title": "Persistent Volume Strategy for Training Checkpoints", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you configure PersistentVolumes for 26 GB checkpoints every 30 minutes over a 3-day, 32-GPU job?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4145", "title": "GPU Operator vs Manual Driver Management in K8s", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the GPU Operator manage that manual installation does not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4146", "title": "K8s Scheduling Latency for Real-Time Inference", "topic": "container-orchestration", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What caused the 30-second 503 gap during rolling updates, and how would you deploy GPU inference pods with zero downtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4147", "title": "P99 vs P50 Divergence Under Load", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is P50-to-P99 divergence a poor signal for capacity planning, and what root causes should you investigate before scaling out?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4148", "title": "Little's Law for GPU Inference Throughput", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Little's Law, how many concurrent requests must be in-flight, and what does this imply for GPU memory reserved for KV cache?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 0}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4149", "title": "Hedged Requests in Distributed Inference", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected P99 improvement for hedged requests, and what is the cost in wasted GPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4150", "title": "Load Balancing Algorithms and Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin fail here, and what load-balancing strategy would you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4151", "title": "End-to-End P99 in Multi-Stage Pipelines", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can end-to-end P99 exceed 200ms, and how would you allocate per-stage tail-latency budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4152", "title": "Prefill-Decode Latency Decomposition", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For the 4096-token prompt and 256-token generation, what are total latency and TTFT, and how do prefill and decode optimizations differ?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 0}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4153", "title": "Coordinated Omission in Latency Measurement", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is coordinated omission, and how does the 
benchmark's pause-while-waiting behavior mask true tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4154", "title": "Tail Tolerance with Backup Requests at Scale", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design backup requests to bring P999 below 200ms on the 64-GPU MI300X service, and what extra GPU cost would that add?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4155", "title": "Queueing Theory Applied to GPU Batch Scheduling", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected batching wait at 20ms, and why can a 50ms batch window improve P99 despite more queueing?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 1}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4156", "title": "Fan-Out Tail Amplification in Mixture-of-Experts", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 128-token MoE batch amplify expert tail latency, and how do you mitigate it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4157", "title": "Tail Latency Budgeting Across Microservices", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What per-service latency budgets and alert thresholds would you set for the tokenizer, H100 inference, and post-processing services?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4158", "title": "NUMA Effects on Inference Tail Latency", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware topology issue explains this bimodal distribution, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4159", "title": "Adaptive Timeout Design for LLM Serving", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design an adaptive timeout strategy that protects long requests and catches short failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4160", "title": "Goodput vs Throughput Under Tail Constraints", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": 
"L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does goodput drop when batch size increases from 32 to 64, and what batch size should you choose?", "chain_ids": ["cloud-chain-auto-025-16"], "chain_positions": {"cloud-chain-auto-025-16": 2}, "chain_tiers": {"cloud-chain-auto-025-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4161", "title": "Tail Latency Impact of Garbage Collection", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does GC disproportionately affect tail latency in ML serving, and what are your mitigation options?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4162", "title": "Little's Law Under Variable Service Times", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What average concurrency does Little's Law predict, and why does service-time variance make P99 queue depth much worse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4163", "title": "Speculative Decoding as Tail Latency Reducer", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "With 5 draft tokens at a 72% acceptance rate, what are the effective accepted tokens per step, the latency per accepted token, and the total speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4164", "title": "Cross-Region Tail Latency in Geo-Distributed Serving", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the 3-region MI300X serving system to meet a global P99 SLA of 150 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4165", "title": "Request Coalescing vs Tail Latency Tradeoff", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under a latency-weighted goodput model, when does request coalescing become net-negative?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4166", "title": "Continuous Batching and Tail Latency Reduction", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L2", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using PagedAttention, what is the size of a single 16-token KV-cache page for a model with 80 layers, 16 KV heads, 128 head dimension, and FP16 precision?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4167", "title": "Fat-Tree Bisection Bandwidth for AllReduce", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the ring AllReduce time for a 1 GB gradient on 256 GPUs over 400 Gbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4168", "title": "NVLink vs Infinity Fabric for Tensor Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do NVLink and Infinity Fabric differ for tensor parallelism, and which is better for the given shard and AllReduce sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4169", "title": "Dragonfly Topology for Large-Scale Training", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which topology better matches the hierarchical communication pattern, and what is the cost difference in switch ports?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 1}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4170", "title": "Torus Topology and Nearest-Neighbor Communication", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you compute the 3D torus bisection bandwidth for the 256-chip TPU pod, and why is it 12.8 Tbps rather than 6.4 Tbps?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 0}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4171", "title": "CXL Memory Pooling for Inference Serving", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you tier the 405B model's weights and KV cache between local HBM and the 2 TB CXL pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4172", "title": "NVSwitch Full-Mesh vs PCIe Hierarchy", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What topology-adjusted AllReduce bandwidth ratio separates NVSwitch from PCIe, and which workloads justify the premium?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "cloud-4173", "title": "Effective Bandwidth for Mixed HBM and DRAM Accesses on MI300X", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the effective bandwidth for memory accesses that hit HBM (5.3 TB/s) 60% of the time and DRAM (0.8 TB/s) 40% of the time, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4174", "title": "Network Topology for Pipeline Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should you map pipeline stages to GPUs, and what bandwidth should you use for a single InfiniBand NDR link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4175", "title": "Rail-Optimized Network for GPU Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the rail-optimized topology, what is its switch count vs a fat-tree, and which AllReduce algorithm maps best to it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4176", "title": "Network Partitioning for Failure Isolation", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which 3D parallelism dimension is most affected by the 512-GPU partition, and how would you design the topology to contain failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4177", "title": "InfiniBand vs RoCE v2 for ML Training", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the ring AllReduce time for a 1 GB buffer on 256 GPUs over 400 Gbps links?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4178", "title": "Topology-Aware AllReduce Algorithm Selection", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare ring AllReduce, hierarchical AllReduce, and NCCL's tree AllReduce; which algorithm minimizes total AllReduce time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4179", "title": "Cross-Rack Bandwidth Planning for Data Parallelism", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Will the cross-rack bandwidth support 
AllReduce, and if not, how do you fix it without adding spine bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4180", "title": "Network Topology Impact on Checkpoint I/O", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the checkpoint write time to the 2×100 GbE NFS server, and what topology-aware checkpointing strategy reduces stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4181", "title": "Multi-Tenant Network Isolation on Shared Clusters", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition the network and set QoS so training AllReduce bursts do not cause inference latency spikes?", "chain_ids": ["cloud-chain-auto-026-07"], "chain_positions": {"cloud-chain-auto-026-07": 3}, "chain_tiers": {"cloud-chain-auto-026-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4182", "title": "4-GPU Tensor Parallel AllReduce on Mixed NVLink and PCIe Gen4", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 4-way tensor-parallel AllReduce time for a 100 MB activation tensor, and which link is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4183", "title": "Adaptive Routing in Dragonfly Under Adversarial Traffic", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why can a 1024-GPU 4 GB AllReduce in a shared dragonfly create global-link hot spots and interfere with a 256-GPU job?", "chain_ids": ["cloud-chain-auto-026-09"], "chain_positions": {"cloud-chain-auto-026-09": 2}, "chain_tiers": {"cloud-chain-auto-026-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4184", "title": "NVLink Domain Size and AllReduce Scaling Limits", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do 8-GPU and 72-GPU NVLink AllReduce times compare for a 405B model, and why do returns diminish beyond 72?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4185", "title": "SM Occupancy vs Achieved Throughput", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can a 50%-occupancy H100 kernel reach 85% peak TFLOPS while 
a 75%-occupancy kernel reaches only 40%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4186", "title": "Warp Divergence in Attention Masking", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Explain why causal masking causes warp divergence and how FlashAttention's tiling strategy eliminates it. With thread coarsening at stride 2048, which sequence positions does thread 49 process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4187", "title": "Tensor Core Utilization vs CUDA Core Fallback", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the unfused runtime breakdown for the 4096 x 4096 matmul plus GELU, and why does fusion matter?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 0}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4188", "title": "Warp Scheduling and Latency Hiding", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With 100 active cycles in a 310-cycle interval, what memory-stall fraction does the kernel experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4189", "title": "Tensor Core Matrix Multiply Tile Sizes", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many FLOPs do 352,256 Tensor Core tiles at 8,192 FLOPs each execute, and is the result measured in GFLOPs or TFLOPs?", "chain_ids": ["cloud-chain-auto-secondary-010-22"], "chain_positions": {"cloud-chain-auto-secondary-010-22": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4190", "title": "Memory Coalescing in Attention Kernels", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does column-wise access to row-major Q/K data destroy coalescing on MI300X, and how do you fix it?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 1}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4192", "title": "Thread Block Size Optimization for H100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": 
"cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the max blocks per SM and occupancy at 128 threads per block, and would 256 threads per block improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4193", "title": "Kernel Fusion Strategy for Transformer Blocks", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you fuse the 12 transformer-block kernels to minimize HBM round-trips without breaking Tensor Core tile efficiency?", "chain_ids": ["cloud-chain-auto-secondary-010-21"], "chain_positions": {"cloud-chain-auto-secondary-010-21": 2}, "chain_tiers": {"cloud-chain-auto-secondary-010-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4194", "title": "MI300X Wave64 vs H100 Warp32 Architecture", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural differences explain why MI300X's 304 CUs excel at large batch training while H100's 132 SMs excel at low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4195", "title": "H100 Persisting L2 Hints for Multi-Model Serving", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you use H100 L2 residency controls under MPS to protect Model A's P99 latency without hard cache partitioning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4196", "title": "Wave Quantization in Kernel Launch", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the efficiency loss of the 140-block kernel, and what is the optimal number of thread blocks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4197", "title": "FP8 Tensor Core Precision on H100 and MI300X", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What H100 FP8 throughput should you use for mixed-precision training when FP16 is 989 TFLOPS, and which FP8 format is best for gradients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4198", "title": "Persistent Kernels for Continuous Batching", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How 
would you design a persistent MI300X kernel for continuous batching, handling work queues and variable batch sizes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4199", "title": "Feature Store Online vs Offline Latency Gap", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this fail, and what latency math definitively proves it is unviable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4200", "title": "Feature Freshness vs Staleness Budget", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the staleness problem and what freshness architecture fixes it without rebuilding the entire pipeline?", "chain_ids": ["cloud-chain-auto-020-15"], "chain_positions": {"cloud-chain-auto-020-15": 1}, "chain_tiers": {"cloud-chain-auto-020-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4201", "title": "Point-in-Time Correctness for Training", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What data leakage pattern is occurring, and how do you architect point-in-time correct feature retrieval?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 1}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4202", "title": "Feature Store Serving Throughput Under Load", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total Redis operations per second does this ad ranking feature workload require, can one 300K ops/s instance handle it, and how would you scale?", "chain_ids": ["cloud-chain-auto-020-14"], "chain_positions": {"cloud-chain-auto-020-14": 1}, "chain_tiers": {"cloud-chain-auto-020-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4204", "title": "Feature Store Schema Evolution", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you manage this schema evolution without breaking production models or requiring simultaneous retraining of all 12?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4205", "title": "Feature Store Infrastructure Cost Optimization", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "How would you compare DynamoDB, spot Spark, and feature pruning to cut the $120K/month feature store spend by 40% without increasing latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4206", "title": "Feature Serving Architecture for Multi-Model Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the feature serving layer to fetch 800 unique features for 30 models within a 10 ms p99 budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4207", "title": "Training-Serving Skew Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes this massive training-serving skew despite a shared feature store, and how can it be prevented?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4208", "title": "Feature Store Backfill Strategy", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you backfill the feature store efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4209", "title": "Double-Buffered Feature Prefetch for A100 Inference", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a prefetch architecture to overlap the 8ms feature fetch with 3ms GPU inference and raise utilization above 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4210", "title": "Feature Store Entity Key Design", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What entity key do you use, and what are the storage implications compared to user-level features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4211", "title": "Feature Store Monitoring and Drift Detection", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor drift across 800 hourly features without adding 45 minutes to the 15-minute pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4212", "title": "Migrating Microservice Features to a Centralized Feature Store", "topic": "feature-store-management", "competency_area": "data", 
"track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you execute this migration without disrupting production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4213", "title": "Feature Store Consistency Guarantees", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the dual-write feature store diverge, and what architecture provides eventual consistency with bounded staleness?", "chain_ids": ["cloud-chain-auto-020-13"], "chain_positions": {"cloud-chain-auto-020-13": 3}, "chain_tiers": {"cloud-chain-auto-020-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4214", "title": "Embedding Feature Serving at Scale", "topic": "feature-store-management", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can Redis handle this workload, and what are the alternatives if not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4215", "title": "Parquet vs TFRecord for Training Throughput", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which format, Parquet or TFRecord, should you use for the 5 TB image dataset to sustain 10 GB/s on 8× A100s, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4216", "title": "Columnar vs Row Format for Feature Tables", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much I/O does a columnar format save versus a row format, and what enables this savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4217", "title": "WebDataset for Distributed Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does WebDataset solve this, and what shard size do you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4218", "title": "Compression Tradeoffs for ML Data Pipelines", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which option (Snappy, Zstd, or uncompressed) best balances cost and performance for 5 epochs over the 10 TB Parquet dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4219", "title": "Storage Format for 
Streaming vs Random Access", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage format property causes this extreme random access latency, and how do you fix it without duplicating the dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4220", "title": "Parquet Row Group Sizing for ML Training", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the Parquet I/O bottleneck, and how should you fix it?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], "chain_positions": {"cloud-chain-auto-secondary-007-02": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4221", "title": "Delta Lake vs Parquet for ML Versioning", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What properties does Delta Lake add over raw Parquet to solve these issues, and what are the performance costs?", "chain_ids": ["cloud-chain-auto-secondary-007-02"], "chain_positions": {"cloud-chain-auto-secondary-007-02": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4222", "title": "Storage Format Selection for Multimodal Datasets", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compare separate modality files, WebDataset tar shards, and Lance for the 80 TB multimodal dataset on throughput, storage, and ops complexity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4223", "title": "Tokenized Dataset Storage Format", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which token storage format—Parquet, raw memmap, or Arrow IPC—best optimizes 200B-token LLM training on 8x H100s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4224", "title": "Storage Format Impact on Shuffle Performance", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare chunk-based, index-based, and streaming shuffle for a 4 TB dataset; which minimizes time-to-first-batch while maintaining statistical quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4225", 
"title": "Petabyte-Scale Format Migration Strategy", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What format choices, migration strategy, and rollback plan would you use to standardize 5 PB across 200 ML datasets within 6 months?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4226", "title": "Storage Format for Streaming Inference Logs", "topic": "storage-format-selection", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a low-cost storage pipeline for 500K LLM inference logs per second at 3.5 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4227", "title": "DP-SGD Training Cost on H100", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much slower will DP-SGD be than 4-hour standard SGD for BERT-base on 8× H100s, and what is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4229", "title": "Privacy-Utility Tradeoff in LLM Fine-Tuning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is full DP-SGD viable for 1.5B-parameter medical NER at ε=1, and what alternatives improve privacy-utility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4231", "title": "DP Composition Across Multiple Model Releases", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What total privacy loss do basic, advanced, and RDP composition give after 12 ε=3 releases, and when must releases stop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4232", "title": "PII Detection and Scrubbing Pipeline", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With 500M documents averaging 2 PII instances each, how many PII instances leak through, and what is the defense-in-depth strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4233", "title": "DP Diffusion for Private Medical X-Ray Synthesis", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What epsilon provides useful synthetic data, and how do you validate that the synthetic images 
are both private and medically useful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4234", "title": "GDPR Right to Erasure with DP Training", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Does DP-SGD at ε=5 satisfy GDPR erasure, and if not, what does retraining cost and what unlearning alternatives exist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4235", "title": "DP-SGD Hyperparameter Tuning Without Spending Privacy Budget", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you tune hyperparameters without exhausting the privacy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4236", "title": "Estimating activation memory for backward pass on H100 with large batch", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Do the batch-32, seq-2048 activations for a 7B BF16 transformer fit in 80 GB HBM without checkpointing?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4237", "title": "Debugging a graph break in torch.compile during transformer training", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why is torch.compile only giving an 8% speedup, and how would you fix the 47 graph breaks?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4238", "title": "Stable fused softmax cross-entropy on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement a numerically stable forward and backward for this fused cross-entropy autograd Function on MI300X?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 1}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4239", "title": "Understanding why torch.compile retraces after dynamic batch size changes", "topic": 
"autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What causes the 3-5 second torch.compile stalls with variable padded batch sizes, and how would you eliminate them?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4240", "title": "Quantifying activation checkpointing trade-off for LLM pretraining on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much memory does checkpointing every other transformer block save, and what compute overhead does it add on MI300X?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4241", "title": "Designing a custom autograd function for a differentiable rendering operation", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you wrap the custom H100 ray-marching CUDA kernel so gradients flow through PyTorch autograd?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4242", "title": "Tracing vs. 
capture: choosing between torch.jit.trace and torch.compile for production", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which PyTorch graph/deployment path would you use for the static BERT transformer and data-dependent post-processing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4243", "title": "Bounded K-FAC graph memory for ResNet-152 on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compute K-FAC second-order gradients for ResNet-152 on MI300X without unbounded graph memory growth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4244", "title": "Profiling autograd overhead in a training loop to identify bottlenecks", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and reduce the 280 ms backward time in this dynamic GNN on H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4245", "title": "Differentiating through a sort operation for learning-to-rank on TPU", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you make the NDCG-style ranking loss differentiable so gradients flow back through item scores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4246", "title": "Managing gradient accumulation with mixed precision and autocast on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should you structure BF16 autocast and gradient accumulation over 8 micro-batches on MI300X to avoid NaNs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4247", "title": "Understanding the autograd graph lifecycle and preventing memory leaks", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the most likely cause of the ~200 MB per-step GPU memory growth, and how would you fix it?", "chain_ids": ["cloud-chain-auto-secondary-007-18"], "chain_positions": {"cloud-chain-auto-secondary-007-18": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4248", "title": 
"Implementing gradient checkpointing for a custom attention mechanism", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you checkpoint the 100K-token chunked attention so backward does not recompute shared KV projections?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4249", "title": "Debugging torch.compile MoE routing graph breaks on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you inspect and fix the torch.compile graph breaks or missed fusion in the H100 MoE routing path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4250", "title": "Handling in-place operations and their effect on the autograd graph", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does in-place ReLU intermittently break autograd here, and what should the team change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4251", "title": "Optimizing backward pass memory for a contrastive learning model on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compute the 4096×4096 CLIP contrastive loss without materializing O(N²) gradients and OOMing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4252", "title": "Diagnosing NaN gradients in a deep network with custom autograd operations", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you stabilize and debug the custom SDF backward that divides by |∇f|?", "chain_ids": ["cloud-chain-auto-secondary-007-17"], "chain_positions": {"cloud-chain-auto-secondary-007-17": 0}, "chain_tiers": {"cloud-chain-auto-secondary-007-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4253", "title": "Comparing forward-mode vs reverse-mode AD for Jacobian computation on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For a 10-output, 3-input PINN Jacobian on H100, should you use jacfwd or jacrev, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4254", "title": "CUDA graphs for GPT-2 XL batch-1 decoding", "topic": 
"autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you capture CUDA graphs for GPT-2 XL batch-1 autoregressive decoding with a growing KV cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4255", "title": "Gradient accumulation correctness with DDP and autograd on MI300X", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should DDP synchronize 4 micro-batches, and how much AllReduce traffic is avoided per optimizer step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4256", "title": "Designing a differentiable data augmentation pipeline for training on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement random crop, color jitter, and cutout so the GAN discriminator loss backpropagates to the generator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4257", "title": "Selective Gradient Computation for Fine-Tuning", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you freeze the first 20 BERT-large layers to reduce memory while fine-tuning only the last 4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4258", "title": "Implementing double backward for meta-learning with MAML on H100", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement MAML second-order inner-loop gradients without the 8x slowdown and 40 GB memory blowup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4259", "title": "Understanding graph break costs in torch.compile for a custom CUDA extension", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you make the custom attention CUDA op traceable by torch.compile without graph breaks?", "chain_ids": ["cloud-chain-auto-secondary-007-16"], "chain_positions": {"cloud-chain-auto-secondary-007-16": 2}, "chain_tiers": {"cloud-chain-auto-secondary-007-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4260", "title": "Autograd correctness testing with gradcheck and gradgradcheck", "topic": "autograd-computational-graphs", "competency_area": 
"optimization", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you verify that the differentiable beam search decoder's custom backward pass is analytically correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4261", "title": "MI300X XCD Count and Die Topology", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the actual MI300X chiplet topology, and why is the 192 GB HBM pool not a monolithic die?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4262", "title": "Chiplet Die-to-Die Bandwidth vs HBM Bandwidth", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What chiplet-level bottleneck should you investigate first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4263", "title": "Yield-Performance Tradeoff in Chiplet Design", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which design should you recommend after comparing monolithic versus 4-chiplet yield and performance?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": {"cloud-chain-auto-secondary-005-04": 0}, "chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4264", "title": "NUMA Effects in MI300X Multi-XCD Workloads", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix NUMA-related MFU loss on MI300X versus H100?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4265", "title": "Coherency Domains Across XCDs", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What coherency constraints must you enforce across XCDs when sharing intermediate reduction buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4266", "title": "MI300X Unified Memory Versus Explicit Copies", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": 
"L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the performance implications of unified memory on MI300X compared with explicit copies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4267", "title": "Interposer Bandwidth Scaling Laws", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does aggregate bandwidth scale sublinearly with XCD count in a 4-XCD chiplet system, and what scaling should you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4268", "title": "Chiplet vs Monolithic Roofline Model", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does the NUMA bandwidth hierarchy of the MI300X affect kernel tuning versus a monolithic H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4269", "title": "Die-to-Die Link Latency vs NVLink Comparison", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is MI300X intra-package Infinity Fabric equivalent to NVLink 4.0 for collectives, quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4270", "title": "Model Parallelism Partitioning for XCD Locality", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition the 70B model across the 8 XCDs to maximize local memory access?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4271", "title": "MI300X XCD to HBM Proximity and Memory Symmetry", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How are the 4 Memory Cache Dies (MCDs) in MI300X physically connected to the 8 XCDs, and what does this mean for memory access symmetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4272", "title": "Chiplet Architecture and Multi-Tenant Isolation", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What architectural challenges arise from XCD-level multi-tenancy?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4273", "title": "MI300X XCD Thermal Throttling from Imbalanced Inference Workloads", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing non-uniform XCD thermal throttling on MI300X during sustained inference, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4274", "title": "Infinity Fabric Topology and All-Reduce Efficiency", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How does the physical topology constrain the logical ring order?", "chain_ids": ["cloud-chain-auto-secondary-005-03"], "chain_positions": {"cloud-chain-auto-secondary-005-03": 3}, "chain_tiers": {"cloud-chain-auto-secondary-005-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4275", "title": "Cross-Die Prefetching for LLM Decode", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you prefetch remote-XCD KV-cache blocks to hide Infinity Fabric latency during MI300X autoregressive decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4276", "title": "Adapter Serving Infrastructure: S-LoRA Paged Memory Design", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design S-LoRA-style paged adapter memory for 1000 LoRA adapters with only 50 active, and what is the overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4277", "title": "H100 Memory Budget for LoRA vs Full Fine-Tuning of a 7B Model", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which approach fits on a single 80 GB H100: rank-16 LoRA or full fine-tuning of a 7B model?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 0}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4278", "title": "QLoRA 4-bit Quantization Memory Arithmetic", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does QLoRA fit 70B fine-tuning on two H100s, and what is the actual GPU memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4279", "title": "Multi-Adapter Batching and Shared Base Model", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you lay out memory and batch requests for 50 rank-16 LoRA adapters sharing one 13B base model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4280", "title": "RLHF Infrastructure: PPO Training Pipeline Architecture", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What four models are needed for PPO RLHF on a 7B policy, and how would you schedule them across 8 GPUs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4281", "title": "Reward Hacking Detection in RLHF Systems", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you diagnose the reward hacking after 3000 PPO steps and what infrastructure-level mitigations would you implement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4282", "title": "Serving Tradeoffs for 100 LoRA Adapters", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "When should you choose merge, dynamic loading, or all-in-memory for 100 LoRA adapters across latency, memory, and operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4283", "title": "PEFT Memory Budget Across Parallelism Strategies", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the per-GPU memory breakdown for tensor-parallel LoRA training on 8 GPUs, and what is the binding constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4284", "title": "Adapter-Aware KV Cache Management", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why can't PagedAttention KV cache pages be shared across different LoRA adapters, and how much memory overhead does this create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4285", "title": "RLHF vs RLAIF Infrastructure Cost Comparison", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "What is the RLHF vs RLAIF cost difference for 10K and 100K preference comparisons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4286", "title": "Personalization at Scale: Per-User LoRA vs Prompt Engineering", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "At what scale does each approach break down when comparing per-user LoRA adapters vs long system prompt personalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4287", "title": "LoRA Rank Collapse Under Large Gradient Accumulation", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the LoRA loss instability and gradient explosions after 500 steps, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4288", "title": "LoRA Rank Sensitivity Analysis", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you select the optimal LoRA rank for the 34B code model, and why do returns diminish beyond rank-64?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4289", "title": "Federated LoRA Training for Privacy-Preserving Adaptation", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design federated LoRA training across 50 hospitals with differential privacy guarantees?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4290", "title": "Mixture of LoRA Experts (MoLoRA) Architecture", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design token routing to 8 specialized LoRA adapters, and what are the compute and memory overheads?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4291", "title": "Prefill-Decode Split Rationale", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does disaggregated serving split prefill and decode onto separate GPU pools, and what hardware bottleneck makes sharing inefficient?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 0}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4292", "title": "KV-Cache Transfer Bandwidth Budget", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the KV-cache transfer size and latency over 400 Gbps InfiniBand, and is it on the critical path for TTFT?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 0}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4293", "title": "TTFT vs TPOT SLO Tension", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What does moving two decode GPUs to prefill fix and break, and what metric determines the right pool ratio?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 1}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4294", "title": "Prefill Stall During Decode Migration", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What causes intermittent 4-second P99 TTFT spikes when decode GPUs have high memory utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4295", "title": "Chunked Prefill Scheduling Window", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What chunk scheduling policy would keep TTFT under 200 ms for inputs up to 32k tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4296", "title": "80GB vs 192GB GPU Decode Pool Sizing", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 70B FP16 decode pool, how do the two GPUs compare on concurrency and token throughput, and which is better?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4297", "title": "Decode Pool Autoscaling Lag", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What leading indicator should trigger decode autoscaling before TPOT degrades?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4298", "title": "Cross-Pool KV Compression Tradeoff", "topic": 
"disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you compress the KV-cache before prefill-to-decode transfer, and under what conditions is the 2x transfer reduction net-positive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4299", "title": "Speculative Decoding in Disaggregated Architecture", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Where should the 7B draft model run in disaggregated speculative decoding, how should KV caches interact, and what failure modes arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4300", "title": "Prefill-Decode Affinity and Context Caching", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where should prefix KV-caches live in a disaggregated architecture, how should they be invalidated, and what TTFT savings come from a 60% hit rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4301", "title": "Decode Preemption and KV Swap", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do resumed preempted sequences have 8x TPOT for the first 20 tokens, and how would you fix it?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 0}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4302", "title": "Optimal Prefill Batch Size for TTFT", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At what sequence length and batch size does the prefill become compute-bound versus memory-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4303", "title": "Network Topology for KV Transfer at Scale", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "At 500 req/s transferring 384 MB of KV-cache each, is the 400 Gbps 2:1 oversubscribed InfiniBand network a bottleneck, and how would you mitigate it?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 1}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4304", "title": "Variable Output 
Length and Decode Pool Imbalance", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin assignment create such extreme decode memory imbalance, and how would you redesign the system to rebalance work?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4305", "title": "TPU v5e as Prefill Accelerator", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is TPU v5e a better prefill accelerator than H100 for a 7B BF16 model, and what practical constraints make the TPU-GPU setup painful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4306", "title": "Length Buckets for 50-8192 Token Prefill", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you batch 50-8192-token prefill requests on 8 GPUs to maximize throughput while keeping TTFT variation within 3x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4307", "title": "Graceful Decode Node Failure During Generation", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you recover 200 in-flight sequences after a decode GPU failure, and what state should be checkpointed at what frequency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4308", "title": "Continuous vs. 
Static Batching in Decode Pool", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does continuous batching improve decode utilization for asynchronous prefill arrivals, and what scheduling event enables it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4309", "title": "Multi-Model Disaggregated Serving Isolation", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect disaggregated serving for 5 LLMs and thousands of tenants while isolating KV caches and capacity across SLOs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4310", "title": "Prefill Throughput Saturation Diagnosis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the 4-GPU prefill pool stuck at 150k tokens/s with 35% GPU compute and 90% PCIe utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4311", "title": "KV-Cache Radix Tree for Prefix Sharing", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much memory does radix-tree prefix sharing save, and how many concurrent requests fit with versus without sharing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4312", "title": "Tail Latency of Decode Under Mixed Workloads", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 8× P99/P50 TPOT tail on H100 with 30% long outputs, and what priority scheduling intervention would reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4313", "title": "Disaggregation vs. 
Colocation Break-Even Analysis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When does disaggregated serving outperform colocation for the 13B mixed workload on 32 H100s after KV transfer overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4314", "title": "Decode Throughput Scaling with Tensor Parallelism", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do TP=2 H100s and TP=1 MI300X compare for 70B decode latency and throughput, and when does AllReduce become the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4315", "title": "Disaggregated Serving Cost Model", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What H100 prefill to MI300X decode pool ratio minimizes cost at 10,000 req/hr while meeting 200 ms TTFT and 50 ms TPOT?", "chain_ids": ["cloud-chain-auto-020-04"], "chain_positions": {"cloud-chain-auto-020-04": 2}, "chain_tiers": {"cloud-chain-auto-020-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4316", "title": "DLRM Architecture Overview", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the two distinct compute pathways, why are they architecturally separate, and what operation fuses their outputs?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 0}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4317", "title": "TB-Scale Embedding Table Sharding", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the sharding strategy to minimize P99 lookup latency, and what determines which tables go to GPU vs CPU?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 1}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4318", "title": "Multi-Stage Ranking Latency Budget", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you allocate the 150 ms SLO across the 4-stage recommendation pipeline, and which stages benefit most from GPU acceleration?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4319", "title": "Embedding Lookup All-to-All Communication", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does embedding all-to-all dominate DLRM training, and what architectural change would reduce communication volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4320", "title": "Online Learning Staleness and Embedding Drift", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is a 10-minute online embedding update cadence insufficient during 30-60 minute viral events despite the model updating on time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4321", "title": "Feature Interaction Layer Arithmetic Intensity", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For batch size 2048, what are the interaction-layer FLOPs and is it compute-bound or memory-bandwidth-bound on an A100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4322", "title": "Embedding Table Update Consistency in Distributed Training", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do high-frequency item embeddings have 3x higher loss, and what optimizer and update-protocol changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4323", "title": "Social-Scale Serving QPS and Caching Strategy", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a multi-tier embedding cache, and what hit rate can a 1 TB GPU HBM L2 cache achieve under Zipf α=0.8?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 2}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4324", "title": "Mixed-Precision Embedding Tables", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Compare FP16, INT8, and INT4 quantization for embeddings: what is the accuracy impact mechanism for each, what is the bandwidth reduction, and which is preferred for the hot/cold split?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4326", "title": "Two-Tower Model vs. DLRM for Retrieval", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which is appropriate for retrieval vs. ranking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4327", "title": "Real-Time Feature Pipeline Latency", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architecture change would cut P99 feature assembly from 45 ms to under 15 ms for the 50 real-time features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4328", "title": "Gradient Accumulation for Sparse Embedding Training", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do sparse gradients OOM during 16-step accumulation on 192 GB GPUs, and how would you fix it without reducing batch or accumulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4329", "title": "Cold Start for New Item Embeddings", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the cold start problem, and what three methods can initialize a new item's DLRM embedding before organic training data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4330", "title": "Model Parallelism Strategy for 100TB Embedding Cluster", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid training strategy and throughput would you expect for 100 TB embeddings and a 10B-parameter MLP on 512 H100s plus CPU nodes?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 3}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4331", "title": "Session-Based Recommendation Temporal Features", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is wrong with concatenating 50 item embeddings into a 3200-dim DLRM input, and what session encoder would you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4332", "title": "Serving Latency vs. Model Freshness Tradeoff", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 100k QPS, how do synchronous online updates compare with 5-minute shadow updates for CTR staleness and serving latency?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 2}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4333", "title": "Recall @100 vs. NDCG Tradeoff in Retrieval Optimization", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can Recall @100 improve from 72% to 81% while end-to-end NDCG @10 drops from 0.38 to 0.35?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4334", "title": "Embedding Dimension Selection and Capacity", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum item embedding dimension fits in 512 GB for 100M FP32 items, and what dimension would you choose using intrinsic dimensionality?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 0}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4335", "title": "Request Deduplication and Result Caching in Rec Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a result caching layer for 30-second re-requests, and what GPU load reduction is expected?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 1}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4336", "title": "DLRM Training on TPU v5e Pod", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which DLRM components map well to a TPU v5e pod, which do not, and what architectural change makes the model TPU-trainable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4337", "title": "Learned Positional Embeddings for Sequence Modeling", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": 
"published", "phase": "inference", "question": "Why do positional embeddings improve session next-item prediction, and when can positional information hurt generalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4338", "title": "Embedding Table Hot Row Replication for Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you replicate the top-10k hot embedding rows across 16 A100 serving nodes, and what read-throughput improvement would that give?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 2}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4339", "title": "Diversity vs Relevance Tradeoff in Re-ranking", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What system-level mechanism caused the 40% diversity collapse despite +3% CTR, and what re-ranking intervention would recover diversity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4340", "title": "Full-Stack RecSys Architecture for a New Social Platform", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What end-to-end recommendation architecture would you build for 100M users at 500k QPS, and how would it evolve to 1B users at 5M QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4341", "title": "DP-SGD Epsilon Budget Exhaustion at Scale", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are your architectural options to extend the remaining budget, and what are the quantitative tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4342", "title": "Privacy Amplification via Subsampling in Production", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does this claim hold, and what are the implementation pitfalls in a distributed multi-GPU training context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4343", "title": "Federated DP with Heterogeneous Client Noise", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": 
"Why isn't the DP-FedAvg privacy bound tighter with 10,000 enrolled clients but only about 200 participants per round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4344", "title": "Membership Inference Attack Resistance Under DP", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is a 61% MIA AUC theoretically possible under an ε=4 DP guarantee, and what vulnerabilities allow it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4345", "title": "Intersectional Fairness Under Distribution Shift", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What caused the subgroup approval drop, and how would you detect and fix intersectional drift?", "chain_ids": ["cloud-chain-auto-secondary-014-31"], "chain_positions": {"cloud-chain-auto-secondary-014-31": 3}, "chain_tiers": {"cloud-chain-auto-secondary-014-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4348", "title": "Multi-Metric Fairness Dashboard for Production LLM", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you build a fairness monitoring pipeline for 500K daily job-description requests, and what compute cost would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4349", "title": "Model Card Infrastructure for 50-Model Production Fleet", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a system that auto-generates and maintains living model cards for 50 production models under EU AI Act requirements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4350", "title": "Red-Teaming Throughput vs Coverage Tradeoff", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the 500 person-hour red-team budget be allocated across free-form, taxonomy-guided, and automated adversarial testing?", "chain_ids": ["cloud-chain-auto-secondary-015-37"], "chain_positions": {"cloud-chain-auto-secondary-015-37": 0}, "chain_tiers": {"cloud-chain-auto-secondary-015-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4351", "title": "RLHF Reward Hacking and Constitutional AI Safeguards", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "How would you fix the RLHF reward-hacking pattern at the training, inference, and monitoring layers?", "chain_ids": ["cloud-chain-auto-secondary-015-37"], "chain_positions": {"cloud-chain-auto-secondary-015-37": 1}, "chain_tiers": {"cloud-chain-auto-secondary-015-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4352", "title": "Operational vs Embodied Carbon in a 1,000-GPU Training Cluster", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For one training run plus allocated embodied carbon, which option has lower lifecycle emissions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4353", "title": "Carbon-Aware Training Job Scheduling", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule a continuous 12-hour, 200-GPU training job within a 48-hour window to minimize carbon while meeting the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4354", "title": "Lifecycle Carbon Analysis of a 3B Model Serving System", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the lifecycle carbon per inference for the 3B LLM over 2 years, and which component dominates emissions?", "chain_ids": ["cloud-chain-auto-020-07"], "chain_positions": {"cloud-chain-auto-020-07": 4}, "chain_tiers": {"cloud-chain-auto-020-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4355", "title": "Carbon Cost Per Inference: INT8 vs FP16 Serving Comparison", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the carbon per 1M tokens for FP16 versus INT8 13B inference, and how much annual carbon does INT8 save at 50B tokens/day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4356", "title": "Spatial Carbon Arbitrage: Multi-Region Training Routing", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the carbon and performance tradeoffs compare for all US-East, all EU-North, and split US-West/EU-North routing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4357", "title": "Carbon-Aware Autoscaling for Inference Endpoints", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", 
"zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign spot-instance autoscaling to use real-time carbon intensity while meeting the 6-hour SLA for 99% of transcription jobs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4358", "title": "SustainabilityML: Reporting Carbon per Training Run", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the IT energy, facility energy, market-based emissions, and location-based emissions (assuming a regional grid of 350 gCO2e/kWh) for the training run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4359", "title": "MI300X Roofline vs H100: Memory-Bound vs Compute-Bound Boundary", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the MI300X and H100 roofline ridge points, and which has higher LLaMA-2 70B decode throughput at AI=0.8 FLOP/byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4360", "title": "MI300X 192GB vs H100 80GB: Multi-Model Serving Density", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can a single 192GB or 80GB accelerator host 8 replicas with 512 total requests, and what is the density advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4361", "title": "TPU v5e vs H100: Systolic Array Efficiency for Transformer Training", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At AI≈100 FLOP/byte for 7B transformer training, which accelerator delivers higher effective throughput and MFU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4362", "title": "MI300X XCD Bandwidth Locality", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does splitting the kernel across all 8 MI300X XCDs reduce bandwidth from 4.2 TB/s to 3.1 TB/s, and how should memory locality be handled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4363", "title": "Serving a 70B Model: MI300X PCIe TP=2 vs H100 NVLink TP=2", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 70B LLM at 200 req/s and p99 <200 ms, how do MI300X PCIe TP=2 and H100 NVLink TP=2 compare, and 
what should you deploy?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 1}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4364", "title": "TPU v5e ICI Topology vs H100 NVLink for Data Parallelism", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using ring AllReduce for a 12GB gradient on 256 accelerators, what are the theoretical times on TPU v5e ICI versus H100 over InfiniBand?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 1}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4365", "title": "MI300X INT4 Quantization: VRAM and Throughput vs H100", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 70B INT4 GPTQ model, what token throughput should H100 and MI300X achieve, and does MI300X keep its bandwidth advantage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4366", "title": "TPU v5e BF16 Reductions and Training Stability", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the TPU v5e BF16 run 1.8% higher loss after 50K steps, and how would you fix the JAX training loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4367", "title": "MI300X Power Budget and Thermal Throttling Under Full Load", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is the MI300X thermally limited at 92°C and 742W under sustained GEMM, and how does its thermal headroom compare with H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4368", "title": "ROCm vs CUDA Ecosystem Overhead for MI300X Deployment", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much of the 85% MI300X vLLM throughput is a ROCm kernel gap versus hardware capability, and what is expected with FlashAttention on ROCm?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4369", "title": "MI300X MIG vs H100 MIG: Multi-Tenant Serving Partitioning", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": 
"How would you serve 14 tenant-isolated 7GB models on H100 MIG versus MI300X, and what MI300X multi-tenant design would you use?", "chain_ids": ["cloud-chain-auto-001-07"], "chain_positions": {"cloud-chain-auto-001-07": 2}, "chain_tiers": {"cloud-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4370", "title": "Mapping TP PP and EP onto a TPU v5e Torus", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you map TP=8, PP=16, and EP=16 for the 1T MoE model onto a 16×16×8 TPU torus, and how does it compare to H100?", "chain_ids": ["cloud-chain-auto-002-09"], "chain_positions": {"cloud-chain-auto-002-09": 2}, "chain_tiers": {"cloud-chain-auto-002-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4371", "title": "Cold KV Cache Offload Latency", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 1GB cold KV fetches, how do PCIe offload and unified memory affect p99 latency for this 180GB serving setup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4372", "title": "TPU v5e vs H100 for a 7B LoRA Fine-Tune", "topic": "quantization-fundamentals", "competency_area": "optimization", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which configuration finishes faster, and what is the per-dollar cost comparison assuming 1.2/chip-hr for TPU v5e and 3.5/GPU-hr for H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4373", "title": "MI300X vs A100: VRAM Capacity Advantage for Long-Context Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the 13B model at 128K context, how large is one request's KV cache, and how many requests fit on 80GB vs 192GB GPUs?", "chain_ids": ["cloud-chain-auto-008-19"], "chain_positions": {"cloud-chain-auto-008-19": 1}, "chain_tiers": {"cloud-chain-auto-008-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4374", "title": "MI300X Prefill Throughput: Compute-Bound vs H100 for Long Prompts", "topic": "roofline-analysis", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which accelerator is compute-bound for prefill at this AI, and what are the expected FLOP efficiencies between A100, H100, and MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4375", "title": "Small Chips vs Large GPUs: Serving Cost Per Token at Scale", "topic": 
"model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For serving the 7B LLM at 100B tokens/day, how many small chips or large GPUs are needed and what is the cost per 1M tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4376", "title": "MI300X AllReduce Performance in 8-GPU Data Parallel Training vs H100", "topic": "collective-communication", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 28GB gradient AllReduce on 8 GPUs, what ring-AllReduce time do MI300X xGMI and H100 NVLink achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4377", "title": "MI300X for Mixture-of-Experts: Expert Capacity and Memory Layout", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What serving configuration should you use for Mixtral 8×7B on H100 versus MI300X, and what decode throughput should each deliver?", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 1}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4378", "title": "Vendor Lock-In Analysis: TPU v5e vs MI300X vs H100 for a 5-Year Infrastructure Plan", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which accelerator should be the primary platform for a 5-year 7B–100B training and serving commitment considering software ecosystem, hardware upgrades, and TCO, and what is your hedge strategy?", "chain_ids": ["cloud-chain-auto-001-19"], "chain_positions": {"cloud-chain-auto-001-19": 2}, "chain_tiers": {"cloud-chain-auto-001-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4379", "title": "Disparate Impact Testing for LLM Embedding Spaces", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "cloud", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If you select the top 20% of 10,000 candidates, what are the Group A and B selection rates and does the embedding system violate the 4/5ths rule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4380", "title": "MI300X HBM3 Bandwidth as a Power Efficiency Lever", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the MI300X and H100 SXM5 peak HBM bandwidths, and how does higher bandwidth affect accelerator count and rack power for inference?", "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4381", "title": "PUE Decomposition for Dense H100 Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For the 1,000-node cluster at PUE 1.4, what is the total facility power, non-IT overhead, and which PUE components dominate compared to traditional compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4382", "title": "Stranded Power in Mixed H100/A100 Rack Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which whole H100 and A100 nodes should fill one 25 kW rack to maximize GPUs, and how much power is stranded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4383", "title": "Thermal Throttling Cascade in a Dense H100 Pod", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing Pods 7-12 to drop from 700W to 500W with 38°C hot aisles, and what immediate and long-term remediation should you take?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4384", "title": "Power Capping Strategy for MI300X Training Under Budget Constraints", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do static 590W caps, 25% batch-size reduction, and dynamic power capping affect throughput and energy efficiency under the 300 kW cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4385", "title": "Waste Heat Recovery Feasibility for H100 Liquid-Cooled Racks", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can 55°C D2C return coolant from 500 H100 nodes be used for 60°C district heating, what options bridge the gap, and how much heat is recoverable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4386", "title": "GPU Power State Transitions and Idle Power Optimization", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which H100 power-management mechanisms reduce idle power, and what savings are possible for 200 GPUs at 40% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4387", "title": "MI300X vs H100: Energy-per-Token at Scale for LLM Inference", "topic": 
"datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should you compare MI300X and H100 energy per token for 100B tokens/day of Llama-3 70B FP16 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4388", "title": "Cluster-Level H100 DVFS Power Capping During AllReduce", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What cluster-level DVFS policy keeps the 4,096-node cluster under 25 MW while exploiting GEMM and all-reduce phase differences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4389", "title": "Power Distribution Unit Over-Subscription in GPU Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a 30 kW continuous, 36 kW peak PDU handle three 8 kW H100 nodes ramping simultaneously, and what soft-start policy should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4390", "title": "Computational Carbon Intensity of MI300X Fine-Tuning Runs", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the carbon footprint of the 64×MI300X fine-tuning job in the Pacific Northwest versus the Southeast, and how should it be reported?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4391", "title": "Cooling Fluid Routing for Hot-Aisle Containment in H100 Deployments", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does hot-aisle containment improve PUE for 8 kW nodes, what thermodynamic mechanism matters, and what PUE gain is typical?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4392", "title": "Server Power Supply Efficiency Curves and 80 PLUS Certification Impact", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the 80 PLUS Gold, Platinum, and Titanium efficiencies at 50% load, and how much does Titanium save versus Gold for one 8×H100 server drawing 8 kW IT load at 50% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4393", "title": "Workload Consolidation to Improve GPU Utilization and Power Efficiency", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": 
"published", "phase": "both", "question": "How much power would consolidating 1,000 underutilized GPUs save, what risks does it create, and how would MIG or time-slicing improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4394", "title": "Transformer Inference Memory Bandwidth Saturation and Power Envelope", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is energy per token lower at batch 64, and how should batching be chosen under a 200 ms TTFT SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4395", "title": "Rack-Level UPS Sizing and Power Factor Correction", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What UPS capacity is needed, why does kVA exceed kW, and what does power factor correction save?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4397", "title": "MI300X OAM Module Thermal Interface and Cooling Validation", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using HBM as the limiting sensor, what monitoring policy prevents throttling and what safe utilization follows from the burn-in data?", "chain_ids": ["cloud-chain-auto-021-03"], "chain_positions": {"cloud-chain-auto-021-03": 2}, "chain_tiers": {"cloud-chain-auto-021-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4398", "title": "Power Anomaly Detection for H100 Training Clusters", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might h100-rack07-node04 sustain 820W per GPU without errors, is it a fault, and what power thresholds should trigger escalation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4399", "title": "Fleet-Wide GPU Energy Efficiency Benchmarking with Performance-per-Watt", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a single energy-efficiency score for a heterogeneous inference fleet, and what would you recommend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4400", "title": "MI300X Unified Memory Architecture and Its Thermal Implications", "topic": "datacenter-efficiency", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why did the MI300X serving process jump to 5.1 TB/s bandwidth and 740W, what are the thermal 
consequences, and what workload change caused it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4401", "title": "Carbon-Aware Inference Routing Across Multi-Region H100 Fleets", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What carbon-aware routing policy would you use for the 18,000 q/h service, and how much Scope 2 carbon would it save under a 200 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4402", "title": "Scope 3 Embodied Carbon Accounting for MI300X Cluster Procurement", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For the 512-GPU cluster, how do Scope 2 and Scope 3 lifecycle emissions compare, and when does embodied carbon exceed operational carbon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4404", "title": "H100 Inference SLA Degradation Under Noisy Neighbor CPU Contention", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the p99 latency spike to 380 ms with unchanged p50 and GPU utilization, and how would you mitigate it immediately and systemically?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4405", "title": "Disaggregated Prefill-Decode Architecture for Tail Latency Control", "topic": "tail-latency", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design disaggregated prefill/decode to meet p99 TTFT under 500ms and p99 generation under 4s including KV transfer?", "chain_ids": ["cloud-chain-auto-025-17"], "chain_positions": {"cloud-chain-auto-025-17": 2}, "chain_tiers": {"cloud-chain-auto-025-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4406", "title": "InfiniBand AllReduce Retransmits in MI300X Rings", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 512-GPU ring AllReduce only achieving 140 Gbps with 35% RDMA retransmits, and what network tuning would you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4407", "title": "RDMA Queue Pair Limits and Scalability in Large H100 Pods", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "At 512-1,024 nodes, why does NCCL initialization hang from RDMA QP exhaustion, and 
what scaling limits and architectural fixes would you propose?", "chain_ids": ["cloud-chain-auto-020-03"], "chain_positions": {"cloud-chain-auto-020-03": 0}, "chain_tiers": {"cloud-chain-auto-020-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4408", "title": "RDMA Memory Registration Overhead in Dynamic Batch LLM Training", "topic": "rdma-transport", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does dynamic tensor allocation add 310ms of ibv_reg_mr overhead per training step, and how would you redesign RDMA memory management?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4409", "title": "UCIe Bandwidth Scaling for Multi-Chiplet AI Accelerators", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What UCIe bandwidth constraint prevents 4 TB/s compute-to-HBM bandwidth, and what chiplet floorplan would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4410", "title": "MI300X NUMA-Aware Tensor Parallel Rank Placement", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you fix the rank placement to resolve the 18% all-reduce latency penalty on MI300X?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4411", "title": "Chiplet Yield Model and Cost per Compute Die", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the 600 mm² monolithic die versus six 100 mm² chiplets, what are the yield and cost differences, and where is the crossover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4412", "title": "HBM3E vs HBM4 for a 24-Month Training Accelerator Roadmap", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the 24-month training accelerator roadmap choose HBM3E now or wait for HBM4, given the bandwidth-compute balance and schedule risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4413", "title": "Coherency Protocol Overhead Across XCD Boundaries", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does XLA fusion slow down 40% when tensors are on different XCDs, and how would you fix the data placement?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4414", "title": "Active Interposer vs Passive Silicon Interposer Tradeoffs", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use an active interposer or a passive silicon interposer for the next AI accelerator, and what full-system tradeoffs determine the choice?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": {"cloud-chain-auto-secondary-005-04": 2}, "chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4415", "title": "Chiplet-Based GPU Multi-Instance Partitioning", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design strong multi-tenant isolation for MI300X by mapping tenant slices onto its XCD topology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4416", "title": "Infinity Fabric vs NVLink for Scale-Up Topology", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 70B LLM training with TP=8, how do 8x NVLink and 8x Infinity Fabric nodes compare on all-reduce and memory capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4417", "title": "XCD Hot-Spot Thermal Management Under Sustained Load", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do XCDs 0 and 1 throttle and reduce throughput by 12% despite uniform TDP and temperatures, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4418", "title": "Disaggregated Memory Architecture: CXL vs HBM on Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For 70B LLM inference, what are the bandwidth and latency tradeoffs of replacing some HBM stacks with CXL 3.0 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4420", "title": "Multi-Die Power Delivery Network Design for AI Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the PDN for the 4-compute-die, 2-HBM chiplet package to reduce matrix-multiply voltage droop from 8% to under 3%?", "chain_ids": ["cloud-chain-auto-secondary-005-04"], "chain_positions": 
{"cloud-chain-auto-secondary-005-04": 1}, "chain_tiers": {"cloud-chain-auto-secondary-005-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4421", "title": "Chiplet Roofline Model for Mixed Precision LLM Training", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At 120 FLOP/byte arithmetic intensity, are MI300X and H100 BF16 transformer forward passes bandwidth-bound or compute-bound, and what throughput do you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4422", "title": "Chiplet Die-to-Die Latency Impact on Pipeline Bubble Rate", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If a forward pass crosses boundaries 3 times at 180 ns each, what causes the 7% throughput drop, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4423", "title": "Heterogeneous Chiplet Integration: DSP + AI Core on Same Package", "topic": "chiplet-architecture", "competency_area": "compute", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the data path and memory sharing between the DSP die and AI inference die for FFT-heavy DNN inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4424", "title": "LoRA Rank Sensitivity to Task Complexity", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does LoRA rank 4 underperform while rank 64 gives no gain over rank 16 for SQL generation, and which rank should you choose?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 1}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4425", "title": "QLoRA Double Quantization Memory Accounting", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory does QLoRA need to fine-tune a 70B model with NF4 and double quantization on one 80GB GPU, and is it feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4426", "title": "Multi-Adapter Hot-Swap Serving with S-LoRA", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you load and 
cache 500 LoRA adapters for the 4xH100 serving system to keep P99 TTFT below 100 ms at 200 QPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4427", "title": "RLHF Reward Model Overoptimization Detection", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are reward scores rising while human ratings fall with KL at 12 nats after 3 PPO epochs, and how would you fix the reward hacking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4428", "title": "Adapter Composition: Sequential vs Parallel LoRA Merging", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For medical and formal-tone LoRA adapters applied together, should you use sequential composition, parallel merge, or task-vector arithmetic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4429", "title": "140B PEFT Comparison for 10K Enterprise Examples", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 140B model and 10K examples, how do LoRA, (IA)^3, and prefix tuning compare on memory, performance, and training time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4430", "title": "LoRA Target Module Selection: Q/K/V/O vs All Linear Layers", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For Mistral 7B classification, how many LoRA parameters do Q+V-only versus all-linear targeting add, and what are the performance implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4431", "title": "Reward Model Architecture for RLHF at Scale", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What reward model size, architecture, and serving strategy would you use to score PPO completions at 500 QPS on a 4x GPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4432", "title": "Continual Adaptation: Catastrophic Forgetting in LoRA Updates", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did original-task performance degrade 15% after four monthly LoRA adapter updates even though the base weights were frozen, and how would you prevent it?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4433", "title": "PPO Clip Ratio Tuning for RLHF Stability", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is causing PPO reward collapses every 200 steps with high value loss and epsilon 0.2, and what tuning changes would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4434", "title": "Multi-Task LoRA Training with Gradient Conflict Resolution", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you diagnose and resolve gradient conflict when code generation loss rises after step 500 while summarization and QA improve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4435", "title": "DPO vs PPO Infrastructure Cost for Alignment at 70B Scale", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With 32 H100s for 1 week and 200K preference pairs, should you choose DPO or PPO to align the 70B model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4436", "title": "Adapter Versioning and Rollback in Production ML Systems", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design adapter versioning and rollback so a bad LoRA adapter v23 can revert to v22 within 5 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4437", "title": "LoRA Adapter Distillation for Latency-Sensitive Serving", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you distill the r=16 domain LoRA adapter for a 70B model to meet a P95 TTFT target below 50 ms while preserving quality?", "chain_ids": ["cloud-chain-auto-secondary-004-26"], "chain_positions": {"cloud-chain-auto-secondary-004-26": 2}, "chain_tiers": {"cloud-chain-auto-secondary-004-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4438", "title": "LoRA Training Data Efficiency: Minimum Viable Dataset Size", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is 500 examples sufficient for LoRA fine-tuning, or is the colleague right about needing 10K?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4439", "title": "Prefill Pool GPU Count Formula for TTFT SLO", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many H100s should a conservative prefill pool include, with operational headroom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4440", "title": "KV-Cache Transfer Compression to Reduce Network Bottleneck", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache compression scheme would reduce the 3.2 ms prefill-to-decode transfer over 400 Gbps InfiniBand and lower TTFT overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4441", "title": "Decode Pool Scaling Policy Under Variable Output Length", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule the decode pool to prevent 2000+ token long-tail requests from spiking P99 TTOT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4442", "title": "Fault Tolerance for In-Flight Requests During Decode Node Failure", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you recover the 200 active generations after a decode node failure without restarting them from scratch?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 1}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4443", "title": "Prefill-Decode Ratio Optimization for LLM Serving Fleet", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What optimal prefill-to-decode GPU ratio would you choose to maximize GPU utilization for a 70B model with 256-token prompts and 512-token outputs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4444", "title": "Chunked Prefill Optimal Chunk Size Selection", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What chunk size would you choose for 2048-token prompts, and how does it trade TTFT against decode TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4445", "title": "HDR vs NDR KV Cache Transfer Latency", 
"topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much P99 latency would 800 Gbps NDR save over 400 Gbps HDR for KV transfer, and what alternative would you propose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4446", "title": "Disaggregated Serving Autoscaler Response Time Analysis", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you autoscale decode capacity for 2x traffic spikes lasting 3 minutes when new nodes take 90 seconds to provision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4447", "title": "Context Caching with Disaggregated KV Stores", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a distributed KV cache for the common 1024-token system prompt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4448", "title": "MI300X vs H100 Decode Pool Economics", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which GPU would you choose for the 70B decode pool, H100 SXM5 or MI300X, based on cost per token and memory capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4449", "title": "SLA-Aware Request Routing Between Prefill and Decode Pools", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you route premium and standard requests across 10 prefill GPUs to meet 100ms and 500ms P99 TTFT SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4450", "title": "Speculative Decoding in Disaggregated Architecture: Draft Pool Design", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the request flow, acceptance sampling, and resource sizing for 7B-draft/70B-target speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4451", "title": "Disaggregated Serving Observability: Key Metrics Dashboard", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which operational metrics should an SRE watch to distinguish 
prefill-pool, KV-transfer, and decode-pool problems?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4452", "title": "Prefill Batch Composition for Throughput Maximization", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you batch prompts from 32 to 4096 tokens to reduce the 60% padding waste?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4453", "title": "KV Cache Eviction Policy for Decode Pool Memory Management", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do >4K-token arrivals trigger vLLM preemption cascades at 95% KV utilization, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4454", "title": "Disaggregated Serving Cost Model: Build vs Buy", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the startup build a disaggregated Llama-3 70B system or buy a managed API, and what is the break-even volume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4455", "title": "Multi-Tenant KV Cache Isolation in Disaggregated Serving", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you isolate KV cache usage so tenant A's 10K-request burst cannot preempt tenant B's requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4456", "title": "Decode Straggler Detection in Large Batch Disaggregated Serving", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is likely causing 1% of decode requests to see 120ms/token while the median is 40ms/token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4457", "title": "Disaggregated Serving with TPU v5e Prefill Nodes", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is TPU v5e a good choice for 70B prefill nodes versus H100, and what integration issues would you expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4458", "title": "Request Migration Between Decode Nodes for Load Balancing", "topic": "disaggregated-serving", 
"competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you live-migrate decode requests from nodes 1-3 to nodes 4-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4459", "title": "Decode Pool Tensor Parallelism Degree Selection", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do TP=1 MI300X, TP=2 H100, and TP=4 H100 compare for 70B decode throughput, TPOT, and cost per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4460", "title": "Continuous Decode Batching TPOT Fairness Impact", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What fairness and TPOT impact does adding 15 new requests to a 90-request continuous decode batch have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4461", "title": "Disaggregated Serving Tail Latency Root Cause Framework", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose P99 TTFT of 800ms when P50 is 120ms and utilization, queues, and bandwidth look normal?", "chain_ids": ["cloud-chain-auto-020-05"], "chain_positions": {"cloud-chain-auto-020-05": 2}, "chain_tiers": {"cloud-chain-auto-020-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4462", "title": "Prefill Warmup and JIT Compilation Latency at Service Start", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should the deployment readiness lifecycle prevent traffic routing before JIT compilation and CUDA graph warmup complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4463", "title": "Disaggregated Serving Graceful Degradation Under Partial Failure", "topic": "disaggregated-serving", "competency_area": "deployment", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you gracefully degrade service after losing 30% of decode capacity while serving as many users as possible?", "chain_ids": ["cloud-chain-auto-020-06"], "chain_positions": {"cloud-chain-auto-020-06": 2}, "chain_tiers": {"cloud-chain-auto-020-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4464", "title": "Embedding Table Sharding Strategy for 10TB Feature Space", "topic": 
"recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you shard 50 embedding tables totaling 10TB across 128 GPUs, including the 512GB largest table?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4465", "title": "Online Learning Embedding Staleness and Cache Invalidation", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do 15% of requests use stale embeddings after updates, and how would you fix the cache invalidation mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4466", "title": "Multi-Stage Ranking Latency Budget Decomposition", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you allocate a 100ms recommendation latency budget across retrieval, light ranking, and heavy ranking?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 1}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4467", "title": "Feature Freshness vs Serving Latency: Pre-Compute vs Real-Time", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which of the 200 features should be pre-computed versus computed at request time, given 50 real-time and 150 hourly features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4468", "title": "Social Graph Embedding Update Frequency for Friend-Aware Rec", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you incorporate friend activity for 1B users at 10K QPS while keeping P99 latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4470", "title": "Embedding Dimension vs Model Capacity Tradeoff in RecSys", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the memory and performance tradeoffs of increasing item embeddings from 64 to 256 dimensions for 10M items?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4471", "title": "Real-Time Embedding Table Update Consistency Under Training", "topic": 
"recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What consistency model would you use when training publishes embedding updates every minute but serving may lag?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4472", "title": "ANN Index Rebuild Latency for Embedding Table Updates", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you architect the system to cut the daily 10M-item HNSW index rebuild from 4 hours to under 30 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4473", "title": "Embedding Gradient Sparsity and Optimizer Choice", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is Adam slowing DLRM training on embedding tables, and which optimizer would you use for embeddings versus dense layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4474", "title": "Two-Tower Model vs Cross-Attention Ranker for Serving Cost", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For ranking 1M candidates at 5K QPS, would you serve retrieval with a two-tower model or a cross-encoder, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4475", "title": "All-to-All Communication Optimization for Distributed DLRM", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce embedding all-to-all communication on 128 GPUs from 40% of DLRM training time to under 10%?", "chain_ids": ["cloud-chain-auto-021-14"], "chain_positions": {"cloud-chain-auto-021-14": 2}, "chain_tiers": {"cloud-chain-auto-021-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4476", "title": "RecSys Cold Start: New Item Embedding Initialization", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you fix the cold-start embedding initialization to prevent new items from losing 3x impressions during their first 7 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4477", "title": "DLRM Training on TPU v5e Pod: Embedding Table Placement", "topic": "recommendation-systems-engineering", 
"competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you place 8TB of embedding tables on a 256-chip TPU pod with only 4TB total HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4478", "title": "Request Deduplication in High-Frequency Recommendation Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a deduplication layer for 100K QPS when 15% of requests are exact duplicates within 100ms?", "chain_ids": ["cloud-chain-auto-021-15"], "chain_positions": {"cloud-chain-auto-021-15": 0}, "chain_tiers": {"cloud-chain-auto-021-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4479", "title": "Exposure Bias and Position Debiasing in Ranking Models", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and correct position bias when positions 1-3 have 5x higher CTR than positions 10-15?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4480", "title": "Sequence Model for Session-Based Recommendation Serving Latency", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is a 15ms Transformer session encoder justified over 2ms session features, and what serving architecture would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4481", "title": "Distributed Training Throughput for 100TB Embedding Cluster", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the distributed training architecture and verify whether 1024 GPUs can process 1T samples in under 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4482", "title": "Serving Diversity vs Relevance Tradeoff in Re-ranking", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a re-ranking algorithm that balances relevance and diversity when 18 of the top 20 items are fashion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4483", "title": "Real-Time Feature Freshness vs Training Distribution Shift", "topic": "recommendation-systems-engineering", 
"competency_area": "architecture", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix the train-serve skew from training on hourly batch features but serving second-level aggregates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4484", "title": "Retrieval Recall at 10K in Multi-Stage Recommendations", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Given a launch gate of under 1% NDCG loss versus 50K oracle retrieval, is 70% recall@10K sufficient?", "chain_ids": ["cloud-chain-auto-021-16"], "chain_positions": {"cloud-chain-auto-021-16": 0}, "chain_tiers": {"cloud-chain-auto-021-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4485", "title": "18-Month Video Recommendation System Scaling Plan", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the first 18 months of recommendations to handle 10x yearly growth without over-engineering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4486", "title": "Embedding Table Memory Bandwidth Optimization with Mixed Precision", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you evaluate FP32, FP16, and INT8 row-wise scaling for DLRM embedding tables, and what precision mix would you choose?", "chain_ids": ["cloud-chain-auto-021-17"], "chain_positions": {"cloud-chain-auto-021-17": 1}, "chain_tiers": {"cloud-chain-auto-021-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4487", "title": "Hot Row Replication Strategy for Embedding Serving", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you replicate the top 1000 hot rows across 16 GPUs while keeping embedding updates consistent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4488", "title": "Recall Metric Selection for Production Retrieval Evaluation", "topic": "recommendation-systems-engineering", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do Recall @K, NDCG @K, and Hit Rate @K compare for evaluating the retrieval stage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4492", "title": 
"MI300X MoE Routing Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate an expert routing mechanism that minimizes cross-node traffic over the 400 Gbps bottleneck while maintaining load balance?", "visual": {"kind": "svg", "path": "cloud-4492.svg", "alt": "Diagram comparing local massive HBM bandwidth versus thin network link between MI300X nodes.", "caption": "MoE Token Routing Network"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4495", "title": "Hierarchical Parallelism Placement on NVSwitch", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Map an 8-way Tensor Parallelism (TP) and 8-way Data Parallelism (DP) strategy to the hardware topology to minimize slow inter-node transfers?", "visual": {"kind": "svg", "path": "cloud-4495.svg", "alt": "Topology diagram showing dense intra-node connections and sparse inter-node connections.", "caption": "NVSwitch vs IB Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4498", "title": "Data Pipeline Throughput Matching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the minimum CPU throughput in images/second required to prevent the GPU from stalling on data loading?", "visual": {"kind": "svg", "path": "cloud-4498.svg", "alt": "Bar chart comparing throughput stages: Disk IO, CPU Augmentation, PCIe Transfer, and GPU Compute.", "caption": "Throughput Bottleneck Stages"}, "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 2}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4499", "title": "Torus vs Fat-Tree for AllToAll Workloads", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which topology inherently supports a higher bisection bandwidth for worst-case AllToAll traffic, and why?", "visual": {"kind": "svg", "path": "cloud-4499.svg", "alt": "Diagram comparing a layered tree network with a grid-like torus network.", "caption": "Fat-Tree vs Torus"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4501", "title": "LLM Admission Control for Tail Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a queueing-aware admission control policy that limits tail latency while keeping the GPUs saturated?", "visual": {"kind": "svg", "path": "cloud-4501.svg", "alt": "A graph showing an uncontrolled queue growing exponentially versus an 
admission-controlled queue flattening out.", "caption": "Controlled vs Uncontrolled Queue"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4504", "title": "PCIe Bottleneck in High-Res Image Training", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the potential bottleneck in transferring uncompressed FP32 4K image tensors over PCIe Gen5 to feed the H100 GPUs?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 3}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4505", "title": "Dynamic MIG Autoscaling on A100", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a scaling strategy using Multi-Instance GPU (MIG) to effectively duty-cycle compute during low-traffic periods without node power-downs?", "chain_ids": ["cloud-chain-auto-secondary-017-37"], "chain_positions": {"cloud-chain-auto-secondary-017-37": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-37": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4506", "title": "Block-wise FP8 KV Cache Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a block-wise FP8 quantization scheme for the KV cache to maximize sequence length while avoiding attention outlier degradation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4507", "title": "LLM Checkpoint Bandwidth Bottlenecks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the NFS aggregate write bandwidth required to checkpoint the model and optimizer states within 30 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4509", "title": "Distributed Vision Dataloader Design", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a data loading pipeline that eliminates GPU starvation and manages network and decoding bottlenecks?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 5}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4510", "title": "LLM Checkpoint Storage Sizing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L2", "zone": "specification", "bloom_level": 
"apply", "status": "published", "phase": "both", "question": "Specify the exact total gigabytes required to save a single full training checkpoint containing all necessary parameters and optimizer states?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4511", "title": "Asymmetric INT8 Quantization Kernels", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Derive weight-only asymmetric INT8 quantization for FP16 weights and explain dequantization during GEMM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4512", "title": "API Gateway Queueing Model", "topic": "queueing-theory", "competency_area": "latency", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Select the appropriate Kendall notation queueing model for this setup and calculate the overall system utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4513", "title": "Shared Memory Tile Sizing for CUDA Matmul", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given a 164KB shared-memory limit per SM, would two 128x128 FP16 tiles fit, and where should they be staged?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4514", "title": "Diurnal Workload Power Scaling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate a dynamic power-management strategy that minimizes energy waste during off-peak hours?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 3}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4515", "title": "Dataloader Thread Blocking", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the mechanical reason for the GPU underutilization and how does changing this parameter fix it?", "chain_ids": ["cloud-chain-auto-003-03"], "chain_positions": {"cloud-chain-auto-003-03": 1}, "chain_tiers": {"cloud-chain-auto-003-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4516", "title": "LLM Queueing Wait Time", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct the M/M/1 wait time estimate and demonstrate how reducing 
service time variance via iteration-level scheduling lowers it?", "visual": {"kind": "svg", "path": "cloud-4516.svg", "alt": "Hockey-stick curves comparing M/M/1 and M/D/1 wait times as utilization approaches 1.0.", "caption": "Queue wait times spike non-linearly at high utilization."}, "chain_ids": ["cloud-chain-auto-024-13"], "chain_positions": {"cloud-chain-auto-024-13": 4}, "chain_tiers": {"cloud-chain-auto-024-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4518", "title": "H100 Data Loading Pipeline Creation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the data pipeline throughput requirements for storage read, CPU decoding, and PCIe Gen5 transfer to prevent GPU starvation?", "visual": {"kind": "svg", "path": "cloud-4518.svg", "alt": "Horizontal bar chart showing throughput jumps from Storage to CPU to PCIe to GPU.", "caption": "H100 Computer Vision Pipeline Bandwidth Stages."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4523", "title": "MI300X Rail-Optimized MoE", "topic": "interconnect-topology", "competency_area": "networking", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the grouping for Expert Parallelism to minimize inter-node traffic bottlenecks?", "visual": {"kind": "svg", "path": "cloud-4523.svg", "alt": "Topology placement showing 8 nodes connected vertically by rail switches.", "caption": "Rail-optimized topology mapping for 8 nodes."}, "chain_ids": ["cloud-chain-auto-026-08"], "chain_positions": {"cloud-chain-auto-026-08": 3}, "chain_tiers": {"cloud-chain-auto-026-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4524", "title": "H100 Parquet Prefetch Sizing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Determine the minimum prefetch buffer size per node to hide the S3 network latency entirely?", "visual": {"kind": "svg", "path": "cloud-4524.svg", "alt": "Throughput stages showing Network Latency gap hidden by Prefetch Buffer.", "caption": "Latency hiding via prefetch buffering."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4526", "title": "Llama-3 70B KV Cache Sizing", "topic": "kv-cache-management", "competency_area": "architecture", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Compute the total memory allocated for the KV cache?", "visual": {"kind": "svg", "path": "cloud-4526.svg", "alt": "Bar chart showing KV cache size exceeding total H100 memory.", "caption": "KV cache memory demand vs single GPU capacity."}, "chain_ids": ["cloud-chain-auto-019-01"], "chain_positions": {"cloud-chain-auto-019-01": 0}, "chain_tiers": {"cloud-chain-auto-019-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4528", "title": "H100 JSONL Decompression Bound", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the throughput bottleneck and state the maximum processing rate of the pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4529", "title": "Multi-Modal Distributed Ingestion Architecture Specification", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify a loader that keeps 10,000 H100s at 60% use, assuming each GPU needs 64 preprocessed 8MB samples every 1s over PCIe Gen5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4530", "title": "H100 Burst Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-offs between deep sleep states and idle power overhead when managing these intermittent traffic bursts?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 2}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4533", "title": "MI300X Huge Embedding Placement", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should the 1TB embedding tables be partitioned across the host CPU memory and the MI300X's 192 GB HBM3 to optimize training speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4534", "title": "H100 Distributed Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a distributed checkpointing strategy to minimize the time spent stalling the GPUs while saving model state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4536", "title": "MoE Tiered Caching", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a tiered caching strategy that minimizes tail latency for expert retrieval while adhering to HBM bandwidth and PCIe Gen5 limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4539", "title": "W8A16 KV Cache Expansion", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", 
"level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the W8A16 weight footprint, then compute the maximum sustainable batch size given the 32-user, 4096-token KV cache requirement, and determine if W8A16 is sufficient?", "validated": true, "math_verified": true, "human_reviewed": {"status": "verified", "by": "expert", "date": "2026-04-28"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4540", "title": "H100 Budget Feasibility", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you derive the required duration and prove whether this budget is sufficient given the GPU's theoretical FLOPs and an estimated 40% Model Flops Utilization?", "chain_ids": ["cloud-chain-auto-secondary-015-06"], "chain_positions": {"cloud-chain-auto-secondary-015-06": 2}, "chain_tiers": {"cloud-chain-auto-secondary-015-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4541", "title": "MI300X DVFS Latency Penalty", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how reducing the GPU clock frequency via DVFS affects both the dynamic power consumption and the latency of individual inference requests?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 0}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4543", "title": "H100 UVM Graph Streaming", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a hardware-aware memory strategy utilizing the memory hierarchy to train this model efficiently without running out of HBM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4545", "title": "Memory Bandwidth Limits of Large Model Generation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum quantization bit-width required to fit the model, and what is the theoretical minimum token generation latency for a batch size of 1?", "chain_ids": ["cloud-chain-auto-014-09"], "chain_positions": {"cloud-chain-auto-014-09": 3}, "chain_tiers": {"cloud-chain-auto-014-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4547", "title": "Training Time Estimation for Large Language Models", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate an equation for the 
total compute required in FLOPs, and estimate the training time assuming 40% Model Flops Utilization (MFU)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4551", "title": "HBM Sharding for MoE Serving", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "cloud", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a memory sharding and offloading strategy to maximize throughput for the skewed traffic while supporting full model routing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4553", "title": "LoRA Adapter Memory Footprint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the memory overhead of the adapters against the KV cache assuming batch size 32, sequence length 2000, 32 layers, and hidden dimension 4096?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4554", "title": "GPU Deep Sleep Energy Savings", "topic": "duty-cycling", "competency_area": "power", "track": "cloud", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total energy savings in kWh if 80 idle nodes (8 GPUs each) are put into deep sleep for 12 hours?", "chain_ids": ["cloud-chain-auto-secondary-017-36"], "chain_positions": {"cloud-chain-auto-secondary-017-36": 1}, "chain_tiers": {"cloud-chain-auto-secondary-017-36": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4557", "title": "Ring AllReduce Bottleneck", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "cloud", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "To diagnose the bottleneck, what is the expected communication time per step using Ring AllReduce on 16 nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4558", "title": "H100 Multimodal Pipeline Pre-fetching", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design a data loading and staging pipeline to prevent the compute units from starving while handling massive video I/O?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4561", "title": "Asynchronous Hierarchical Checkpointing for Trillion Parameter Models", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "cloud", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an asynchronous, hierarchical checkpointing system that minimizes blocking time on the GPUs?", "validated": true, "math_verified": true, "scenario": "",
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4570", "title": "MI300X Throughput Stalls and Checkpoint I/O", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether this periodic stall is caused by the global data shuffling pipeline or an implicit checkpointing configuration?", "chain_ids": ["cloud-chain-auto-003-01"], "chain_positions": {"cloud-chain-auto-003-01": 3}, "chain_tiers": {"cloud-chain-auto-003-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4571", "title": "MoE Autoscaling Cluster", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "cloud", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a serving cluster architecture and memory allocation specification that dynamically shifts resources without massive cold-start latencies?", "chain_ids": ["cloud-chain-auto-001-20"], "chain_positions": {"cloud-chain-auto-001-20": 2}, "chain_tiers": {"cloud-chain-auto-001-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4573", "title": "Hardware NVDEC Offloading", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "cloud", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the quantitative bandwidth impact of shifting video decoding from CPU to the GPU's hardware NVDEC engines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "cloud-4578", "title": "PCIe Bottleneck for Swapping LoRA Adapters", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "cloud", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the latency penalty added to token generation strictly from fetching these adapters over PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0001", "title": "The Camera Data Deluge", "topic": "mlops-lifecycle", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before any processing or compression, what is the approximate bandwidth you must provision to move raw pixel data from the sensor to the SoC's memory for this single camera stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~600 KB/s", "~6 MB/s", "~60 MB/s", "~6 GB/s"], "correct_index": 2}}, {"id": "edge-0002", "title": "The Sensor's Front Door Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the most likely component causing the dropped or corrupted frames before they reach main memory?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The LPDDR5 main memory bandwidth is too low to handle six streams.", "The Jetson's GPU is not powerful enough to process the incoming frames.", "The MIPI CSI-2 camera interconnect bandwidth is saturated.", "The NVMe SSD is too slow to store the incoming video frames."], "correct_index": 2}}, {"id": "edge-0003", "title": "The Sensor Bandwidth Limit", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the approximate maximum data rate you can expect to transfer from a standard 4-lane MIPI CSI-2 interface into the SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~205 GB/s", "~50 GB/s", "~2.5 GB/s", "~125 MB/s"], "correct_index": 2}}, {"id": "edge-0004", "title": "The Production Data Glitch", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What data rate does the 1920x1080 8-bit 200 FPS camera produce, and could USB 3.0 be causing the glitches and accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~41.5 MB/s. The data rate is low, so the bottleneck must be elsewhere.", "~3.3 Gbps. This is comfortably within the 5 Gbps theoretical limit of USB 3.0, so the interface is not the issue.", "~415 MB/s. This rate saturates the real-world throughput of a USB 3.0 interface, likely causing dropped frames and data corruption.", "~2.1 MB/s. 
The data rate is trivial; the issue is likely the model's processing speed on the Hailo-8."], "correct_index": 2}}, {"id": "edge-0005", "title": "The Automotive I/O Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which component typically offers higher data bandwidth: the AGX Orin's LPDDR5 memory system or the MIPI CSI-2 camera interface that feeds it?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 0}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly equal, as they are designed to be balanced.", "The LPDDR5 memory system, by about 80x.", "The MIPI CSI-2 camera interface, by about 10x.", "The LPDDR5 memory system, but only by a small amount (~2-3x)."], "correct_index": 1}}, {"id": "edge-0008", "title": "The Sensor Bandwidth Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the approximate maximum data rate a standard 4-lane MIPI CSI-2 camera interface can sustain?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~25 MB/s", "~250 MB/s", "~2.5 GB/s", "~25 GB/s"], "correct_index": 2}}, {"id": "edge-0009", "title": "The Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is your colleague dangerously wrong regarding the 30 FPS frame budget?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 2}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0013", "title": "The Jittery Robot Arm", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the common system-level culprits for this jitter, and how would you diagnose and mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0015", "title": "The AV Pipeline Stall", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most likely cause of the 5.5ms overhead you are observing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's operators are not fully supported by the Jetson AGX Orin's hardware, causing frequent CPU fallbacks.", "The 100 Gbps interconnect is the bottleneck; transferring the 64 MB feature map is taking ~5.1ms.", "Real-time OS scheduling jitter and CUDA kernel launch overhead are consuming the extra 5.5ms.", "The memory bandwidth on one of the Orin 
modules is saturated, slowing down its 12.5ms computation."], "correct_index": 1}}, {"id": "edge-0016", "title": "The AV Perception Pipeline Stall", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Given this data, how would you diagnose and solve this pipeline stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too slow. It must be quantized from FP16 to INT8 to reduce its runtime below 16ms per frame.", "The Jetson AGX Orin is memory bandwidth-bound. The 6 video streams are saturating the 204.8 GB/s LPDDR5 bus.", "The system is executing the 6 camera streams serially. The tasks should be parallelized using CUDA streams to run concurrently.", "The Jetson AGX Orin lacks sufficient compute. 275 TOPS isn't enough for 6 cameras and must be upgraded."], "correct_index": 2}}, {"id": "edge-0017", "title": "The Pipeline Overlap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Without changing any model or buying new hardware, how do you improve throughput and reliably meet the 30 FPS target?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 3}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0018", "title": "The Thermal Throttling Deadline Miss", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the latency increase so sharply?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 2}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0020", "title": "The TensorRT vs ONNX Runtime", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should you switch to ONNX Runtime for faster deployment, or is there a better approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0021", "title": "The Rainy Day mAP Cliff", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can moderate rain create an mAP cliff, and how should the FL pipeline treat that field signal?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0023", "title": "The Camera-to-Inference Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's going 
wrong, and how do you fix it without buying faster hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0024", "title": "The WCET Analysis Wall", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is measurement-based WCET insufficient for safety certification, and what does a valid WCET analysis require for a neural network?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 4}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0025", "title": "The RTOS vs RT-Linux Tradeoff", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Who is right, and how do you design a system to resolve this conflict?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 2}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0026", "title": "The Hard Real-Time Challenge", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the software stack on an edge SoC to ensure deterministic performance, especially when running complex ML models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0028", "title": "The ROS 2 IPC Overhead", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is ROS 2 doing that costs 35ms, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0029", "title": "The Python GIL Multithreading Trap", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did Python multithreading fail to improve the framerate on the 4-core Raspberry Pi?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0030", "title": "The Adaptive Quality Ladder", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What system design maintains 30 FPS across varying scene complexities without changing hardware?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0031", "title": "The Solar Panel Degradation Budget Squeeze", 
"topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How does a 24% panel degradation cause a 5-hour blackout, and how do you adapt the ML workload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0033", "title": "Edge-Cloud Hybrid Inference Break-Even", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Calculate the break-even point — when does on-device become cheaper than cloud?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0034", "title": "The Multi-Tenant Edge Scheduler", "topic": "safety-certification", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a multi-tenant scheduler so loss prevention meets its deadlines during Black Friday?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0035", "title": "The Preemption Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why didn't the High-Priority ML Thread just preempt the Low-Priority thread?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0036", "title": "The WCET Analysis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you construct the worst-case execution time (WCET) argument?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 5}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0037", "title": "The Sensor Aging Silent Accuracy Rot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you detect and correct fleet-wide gradual degradation that's invisible at the individual device level?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0038", "title": "The V2X Latency Requirement", "topic": "safety-certification", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can you meet the 10ms budget using standard 5G routing?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0041", "title": "The Post-Replacement Camera Miscalibration", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What did the technician miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0043", "title": "The Dropped Frame Dilemma", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a robust, real-time sensor data pipeline that handles variable ML inference times and prevents data loss or desynchronization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0048", "title": "The YUV Conversion Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is the NPU only running at 40% utilization while you are dropping frames?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 2}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0049", "title": "The GPS Time Sync Sensor Fusion Failure", "topic": "ota-firmware-updates", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's causing the systematic offset?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0051", "title": "Sensor Fusion Latency Budget", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can you meet the 50ms deadline, and what is the maximum sensor-to-output latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0053", "title": "The Asynchronous Orchestra", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you synchronize these disparate sensor inputs for real-time ML perception without introducing excessive latency or data staleness, especially when ML inference itself takes time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0054", "title": "The Camera VSync Tearing", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is your latency violently oscillating between 12ms and 28ms?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0056", "title": "The Radar-Camera Fusion Latency", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the extra 25ms hiding, and when does the fusion latency overhead negate the benefit of having radar?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0059", "title": "Multi-Sensor Calibration Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you automate fleet-wide sensor calibration to maintain sub-0.1° rotation and sub-2cm translation without taking vehicles offline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0060", "title": "The Perpetual Calibration Problem", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you design a system that maintains high-precision sensor fusion and localization accuracy over half a decade without human intervention for recalibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0061", "title": "Diagnosis: Root Cause of Thermal Throttling in Sealed Edge Enclosures", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What caused the gradual performance degradation, and how can it be fixed?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 1}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0063", "title": "The Battery Saver", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you maximize battery life while ensuring reliable detection and responsiveness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0064", "title": "Battery Life for Solar-Powered Edge Device", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Estimate the daily energy budget and determine if the system can run indefinitely on the 10W solar panel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0065", "title": "The Power State Machine", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How would you design the power state machine to meet the <200ms wake target and 60-day battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0066", "title": "The Thermal Zone Juggle", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the sudden latency increase for the GPU model despite being under the system TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0067", "title": "The Silent Slowdown", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely cause of this 'silent slowdown,' and how would you design the system to guarantee sustained performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0068", "title": "The Overheating Robot Dog", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the robot dog's system to maintain acceptable navigation performance despite long-duration thermal constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0069", "title": "The Duty Cycling Power Budget", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you run a 45W workload on a 30W thermal budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0071", "title": "The Overheating Vision Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely culprit for this performance drop, and how does it impact real-time design?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0072", "title": "The DVFS Latency Jitter", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's causing the latency spikes, and how do you guarantee the 25ms deadline?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 3}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0073", "title": "The Power-Over-Ethernet Budget", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much power budget remains for the AI 
accelerator under standard PoE, and which edge chips actually fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0075", "title": "Thermal Headroom in a Sealed Enclosure", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal headroom do you have in summer and winter, and what thermal-aware inference policy would you design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0076", "title": "The Thermal Throttling Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the system to maintain a minimum acceptable performance level under varying thermal and power constraints without completely failing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0077", "title": "The Thermal Staircase", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does performance degrade in steps rather than gradually, and how do you design around it?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0078", "title": "The Thermal Derating Curve", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How is thermal derating causing the 30 FPS to 21 FPS drop, and what thermal resistance is required to safely sustain 25W at 45°C ambient?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 3}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0081", "title": "The Throttled Vision System", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most likely cause of the sudden drop in NPU utilization and FPS during continuous operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0082", "title": "The Factory Floor EMI Ghost", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the root cause, and how do you fix it without moving the devices?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-0086", "title": "Thermal-Aware Inference Scheduler", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule 5 perception models to prevent thermal throttling while keeping safety-critical latency under 33ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0088", "title": "The Perpetual Sensor", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architecture and power management strategies would let a motion-triggered wildlife camera run for 5 years on a small battery pack?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0089", "title": "The Energy-Aware Reconnaissance Drone", "topic": "compound-ai-systems", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design adaptive inference for a 6-hour drone mission when ML power drops to 20-30% during maneuvers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0090", "title": "The Solar-Powered Edge Budget", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What power budget and maximum inference rate can the 20W solar, 100Wh bird-classifier station sustain with 5 hours of sunlight per day?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0092", "title": "The TensorRT Engine Portability Trap", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong with the TensorRT engine migration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0093", "title": "The Model Pruning Speedup Myth", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 90% sparsity give no speedup on the Jetson Orin NX, and what kind of pruning would actually help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0094", "title": "The Optimization Ladder", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is designing a custom architecture the wrong first step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0095", "title": "The Pruning Paradox", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": 
"L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the same pruning give wildly different speedups on different hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0096", "title": "The Safety Watchdog Timer", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What caused the safety watchdog timer to fire despite the 35ms average inference time?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 2}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0097", "title": "The RTOS Interconnect Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which IPC mechanisms would you use on the RTOS to meet real-time deadlines, move large buffers efficiently, and isolate process failures?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 0}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0098", "title": "The Edge-Cloud Hybrid Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What 4G offloading policy should handle the 5% ambiguous crop images, and does the hybrid approach actually help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0099", "title": "The Deterministic Inference Mirage", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the 2.7ms timing variation on identical inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0100", "title": "The Update Blind Spot", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary security flaw in an update design that deletes the old model before downloading and loading the new one, taking 45 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0101", "title": "The Memory Copy Ceiling", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the memory flow diagram, where are the 'missing' milliseconds being spent, and 
how do you hit the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0102", "title": "The Model Cloning Waste", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the VRAM allocation diagram, why is your system using 16% more memory than it needs to?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 3}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0103", "title": "The Bandwidth Bankruptcy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the data flow diagram, what is the economic bottleneck in your architecture, and how do you resolve it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0104", "title": "The Sealed Oven Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the thermal diagram, what is the physical flaw in your cooling strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0105", "title": "The Rolling Shutter Tear", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Based on the exposure diagram, what physical phenomenon is destroying your accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0106", "title": "The Memory Pressure Leak", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the producer-consumer diagram, why is your memory usage increasing over time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0107", "title": "The Memory Copy Choke", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the data path diagram, what is the 'silent' task consuming all your CPU cycles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0109", "title": "The Sequential Serializer", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the utilization diagram, how would you collapse this 
timeline to fit the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0110", "title": "The Bus Priority Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Based on the SoC architecture described here, what physical component is causing the NPU to stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0111", "title": "The Thermal Throttle", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Between Chip A (80 TOPS peak, 40W TDP) and Chip B (25 TOPS peak, 8W TDP), which will deliver higher sustained performance within the drone's 10W thermal envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Chip A, providing 80 TOPS.", "Chip B, providing 100 TOPS.", "Chip A, providing 20 TOPS.", "Chip B, providing 25 TOPS."], "correct_index": 3}}, {"id": "edge-0112", "title": "The Case of the Missing Gigabytes", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Besides the model itself and the OS, what other major consumer of DRAM on an edge device accounts for the multi-gigabyte discrepancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Page file / swap space on the eMMC storage.", "Memory fragmentation from other running processes.", "The OS kernel, which typically uses a few hundred megabytes.", "A large, pre-allocated DMA buffer for the camera sensor stream."], "correct_index": 3}}, {"id": "edge-0114", "title": "The ViT Memory Wall on the Edge", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the ViT, what is the most likely primary performance bottleneck that could cause it to miss the 33ms deadline?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 0}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The total number of FLOPs in the MLP blocks", "The flash storage required for the model's parameters", "Memory bandwidth saturation from the quadratic complexity of self-attention", "The latency of the initial patch embedding (stem) layer"], "correct_index": 2}}, {"id": "edge-0116", "title": "The Throttled Robot", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate sustained INT8 TOPS you can expect to achieve in this mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "~140 TOPS", "~70 TOPS", "15 TOPS"], "correct_index": 2}}, {"id": "edge-0117", "title": "Pruning for Parallelism", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To achieve a real-world speedup with the TensorRT runtime, which fundamental type of pruning should you recall as the most effective starting point?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0118", "title": "The 'Open-Case' Vulnerability", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which physical interface on the device's circuit board represents the most direct and highest-bandwidth point of attack for this data injection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The I2C bus used for sensor configuration.", "The encrypted LPDDR5 DRAM where the model is stored.", "The MIPI CSI-2 camera interface.", "The UART serial console used for debugging."], "correct_index": 2}}, {"id": "edge-0121", "title": "The INT8 Memory Footprint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 25M parameters, how much memory do the weights require in FP32 versus INT8, and how much is saved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 MB (FP32) and 25 MB (INT8), a 2x reduction.", "200 MB (FP32) and 25 MB (INT8), an 8x reduction.", "100 MB (FP32) and 25 MB (INT8), a 4x reduction.", "100 MB (FP32) and 12.5 MB (INT8), an 8x reduction."], "correct_index": 2}}, {"id": "edge-0122", "title": "The Transformer's Quadratic Curse", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does computational cost scale when increasing from 320x320 to 640x640 for a CNN versus a Vision Transformer (ViT)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both architectures scale linearly, leading to a ~4x increase in FLOPs.", "The CNN's cost scales ~4x, while the ViT's scales ~16x.", "The CNN's cost scales quadratically (~16x), while the ViT's is linear (~4x).", "The cost increase is negligible (~2x) for both due to hardware acceleration."], "correct_index": 1}}, {"id": "edge-0124", "title": "The Thermal Throttling Trap: Power Budgeting", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What sustained INT8 performance should you design around under the robot's 30W thermal limit, and why?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["138 TOPS, because performance scales linearly with the power budget.", "275 TOPS, the device will just run hotter but should maintain its advertised performance.", "Around 170 TOPS, as the device selects its more efficient 30W DVFS power profile.", "26 TOPS, which is the peak performance of other efficient edge accelerators in that power range."], "correct_index": 2}}, {"id": "edge-0125", "title": "The Structured Sparsity Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What theoretical speedup should 2:4 structured pruning deliver for this compute-bound model in TensorRT?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No speedup, it only saves memory.", "1.5x speedup, due to framework overheads.", "2x speedup.", "4x speedup."], "correct_index": 2}}, {"id": "edge-0126", "title": "The Compromised Robot", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which first-line defense should make the robot verify signed model artifacts before loading tampered weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Encrypt the model file stored on the robot's flash memory.", "Retrain the vision model using adversarial training to make it robust to sticker attacks.", "Enable Secure Boot and require signed model verification in the trusted loader.", "Configure a firewall on the robot to block all incoming network traffic."], "correct_index": 2}}, {"id": "edge-0127", "title": "The Power Efficiency Fallacy", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Between a Jetson AGX Orin and a Hailo-8, which device is fundamentally more power-efficient in terms of TOPS/W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Jetson AGX Orin, because its 275 TOPS is over 10x higher than the Hailo-8's 26 TOPS.", "The Hailo-8, because it delivers over 2x the TOPS/W compared to the Jetson.", "They are roughly equivalent in efficiency; the choice depends on other factors.", "It's impossible to tell without knowing the specific model architecture."], "correct_index": 1}}, {"id": "edge-0128", "title": "The Edge Roofline Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Based on their hardware specs, which device, a Jetson AGX Orin or a Hailo-8, is more likely to be bottlenecked by memory bandwidth rather than its compute units?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": "", "options": ["The Hailo-8, because its compute (26 TOPS) is lower, making it the bottleneck. (Calculated Trap: ignores Ridge Point)", "The Jetson AGX Orin, because its Ridge Point is much higher than classic CNN Arithmetic Intensity.", "Neither, as both use modern memory systems that eliminate bottlenecks. (Calculated Trap: ignores Roofline Model entirely)", "The device with more RAM, as it will try to process more data at once. (Calculated Trap: confuses capacity with bandwidth)"], "correct_index": 1}}, {"id": "edge-0129", "title": "The Unified Memory Contention", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What fundamental hardware constraint is causing the unstable latency and system lag?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU's dedicated HBM memory is too small for the model.", "The CPU and GPU are competing for access to the same shared LPDDR5 DRAM.", "The PCIe bus connecting the CPU and GPU is saturated.", "The Linux OS is swapping model memory to the NVMe drive."], "correct_index": 1}}, {"id": "edge-0130", "title": "The CPU-Free Camera Ingest", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hardware feature lets the camera write frames into system DRAM for the GPU without CPU data copies?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 0}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A high-priority CPU thread performs a `memcpy` from the camera to DRAM.", "The GPU reads pixels directly from the camera sensor's private SRAM.", "Direct Memory Access (DMA) controllers manage the transfer independent of the CPU.", "The camera data is sent to the GPU through a series of L1 cache line fills."], "correct_index": 2}}, {"id": "edge-0131", "title": "The Edge Robot's Memory Tax", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "After the 6 GB system reservation, will the 5B-parameter FP16 model plus 4 GB of activations fit in DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the model's 5B parameters require 20 GB in FP16, which is too large for the 26 GB available after activations.", "No, it won't fit because the 14 GB model requirement exceeds the total system DRAM.", "Yes, it requires 14 GB (10 GB for FP16 weights + 4 GB for activations), which is less than the 26 GB of available DRAM.", "Yes, it fits because the model's 10 GB of weights is less than the total 32 GB of DRAM."], "correct_index": 2}}, {"id": "edge-0132", "title": "The High-Speed Camera's DMA Budget", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the DMA transfer time 
for one 4K RGB frame over the 4-lane MIPI CSI-2 bus, and does it fit the 60 FPS frame budget?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 1}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the transfer takes ~80 ms, which exceeds the 16.7 ms budget.", "Yes, the transfer takes ~10 ms, which is well within the 16.7 ms budget.", "No, the transfer time is only ~0.12 ms because it uses the Jetson's 204.8 GB/s DRAM bandwidth, but the processing will be the bottleneck.", "Yes, the transfer is nearly instantaneous because DMA operations don't consume bus bandwidth."], "correct_index": 1}}, {"id": "edge-0133", "title": "The 30 FPS Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum latency budget for the entire ML inference pipeline for a single frame to meet the 30 FPS hard real-time requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16.6 ms", "30 ms", "33.3 ms", "100 ms"], "correct_index": 2}}, {"id": "edge-0134", "title": "Worst-Case vs. Average-Case", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Does this system satisfy the real-time requirement?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 0}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the average latency (28ms) is well below the 33ms deadline.", "No, because the worst-case latency (45ms) exceeds the 33ms deadline.", "Yes, because the Jetson AGX Orin has over 200 TOPS, which is sufficient.", "It's impossible to say without knowing the P99.9 latency."], "correct_index": 1}}, {"id": "edge-0135", "title": "The 30 FPS Frame Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum frame budget in milliseconds for the entire vision pipeline to meet a 30 FPS requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16.7 ms", "30 ms", "33.3 ms", "0.033 ms"], "correct_index": 2}}, {"id": "edge-0136", "title": "Throughput vs. 
Latency", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 26 TOPS Hailo-8 run a 2 TOPs-per-frame model at 30 FPS, and what maximum FPS can it achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, 26 TOPS is much greater than the 2 TOPS required.", "No, it can only achieve about 7.7 FPS.", "No, it can only achieve 13 FPS.", "Yes, it can run at 520 FPS."], "correct_index": 2}}, {"id": "edge-0137", "title": "The Thermal Handcuffs", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the *maximum sustained performance* you can realistically expect from this chip under that thermal constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4.6 TOPS", "275 TOPS", "69 TOPS", "0.3 TOPS"], "correct_index": 2}}, {"id": "edge-0138", "title": "Crossing the Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Is this model compute-bound or memory-bound on the Orin given 2,000 Ops/Byte versus a 1,342 Ops/Byte ridge point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is Memory-Bound.", "The model is Compute-Bound.", "The model is I/O-Bound.", "The model is Thermal-Bound."], "correct_index": 1}}, {"id": "edge-0139", "title": "The Orin Utilization Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this 50 GOps, 100 MB model on the Jetson AGX Orin limited by compute or memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bound because its Arithmetic Intensity (~500 Ops/Byte) is less than the Orin's Ridge Point (~1342 Ops/Byte).", "The model is compute-bound because the Jetson Orin has a very high peak compute of 275 TOPS.", "The model is compute-bound because its Arithmetic Intensity is high (50 Giga-ops is a large number).", "The model is memory-bound because its power consumption (TOPS/W) would be too high otherwise."], "correct_index": 0}}, {"id": "edge-0140", "title": "The Edge SRAM Speed Advantage", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much faster is accessing this on-chip SRAM compared to accessing the main system LPDDR5 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x faster.", "~300x faster.", "~25x faster.", "They are about the same speed."], "correct_index": 2}}, {"id": "edge-0141", "title": "Defining the Tensor Arena", "topic": "tensor-arena-planning", 
"competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary purpose of this memory region?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To store the model's immutable weight parameters read from flash.", "To dynamically allocate memory for activations using `malloc` as needed.", "To provide a static memory block for all activation tensors, avoiding `malloc`.", "To serve as a high-speed cache for the main system DRAM."], "correct_index": 2}}, {"id": "edge-0142", "title": "The SRAM Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the peak tensor arena memory usage with buffer reuse, and will it fit in 256 KB of SRAM?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["288 KB, because you sum all the tensors (96+128+64). It will not fit.", "128 KB, because that is the size of the largest tensor. It will fit.", "224 KB, because the peak is the sum of the input and output of the first layer (96+128). It will fit.", "112 KB, because you calculated with INT8 instead of FP16. It will fit."], "correct_index": 2}}, {"id": "edge-0143", "title": "The DMA Offload Dividend", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How long does a 300 KB transfer take using a 10 GB/s CPU memcpy versus a 100 GB/s DMA transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU takes ~1.5 µs, the DMA takes ~0.3 µs. 
Both are very fast.", "The CPU takes 30 µs, the DMA takes 3 µs.", "Both take roughly the same time, ~1.5 µs, as they are limited by the device's 204.8 GB/s peak memory bandwidth.", "The CPU takes ~240 µs, the DMA takes ~24 µs, because you confused GB/s with Gb/s."], "correct_index": 1}}, {"id": "edge-0145", "title": "The Stereo Vision Memory Squeeze", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Will quantizing two 50 MB FP16 models to INT8 fit within the 60 MB SRAM weight budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the total size is still 100 MB, which is over the 60 MB budget.", "Yes, the total size will be 25 MB, leaving plenty of extra space.", "Yes, the total size will be 50 MB, which fits within the 60 MB budget.", "No, the quantization only provides a 1.5x reduction, resulting in a total size of ~67 MB, which is still too large."], "correct_index": 2}}, {"id": "edge-0148", "title": "The Perception of Speed on Edge Devices", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following metrics is the most crucial to minimize to address the user's perception of slowness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time to First Token (TTFT)", "Time per Output Token (TPOT)", "Peak Memory Usage", "Model FLOPS"], "correct_index": 0}}, {"id": "edge-0149", "title": "The Real-Time Deadline Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is batching two frames feasible when single-frame inference is 20 ms and the real-time deadline is 33 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the amortized time per frame is still 20ms, which is under the deadline.", "No, the total processing time for the batch (40ms) exceeds the 33ms real-time deadline.", "Yes, because batching increases compute efficiency and overall FPS.", "Yes, because the time per frame in the batch becomes 10ms (20ms / 2), which is faster."], "correct_index": 1}}, {"id": "edge-0150", "title": "The Continuous Batching Queue", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "When will Frame A's result be ready, and what is its total latency from arrival at T=10ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30ms, the time it takes for the model to run inference.", "40ms, the time from arrival until the batch is dispatched.", "70ms, the clock time when the inference finishes, yielding a 60ms total latency.", "10ms, because the frame is processed immediately."], "correct_index": 2}}, {"id": "edge-0151", "title": "The Edge Device Power Average", "topic": 
"duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is its average power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.375 W", "500 mW", "700 mW", "2.5 W"], "correct_index": 2}}, {"id": "edge-0152", "title": "The Passive Cooling Limit", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "To prevent the system from overheating, what is the maximum sustainable duty cycle (the percentage of time the accelerator can be active)?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100%", "60%", "40%", "2.5%"], "correct_index": 2}}, {"id": "edge-0153", "title": "The Kernel Launch Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary overhead that this operator fusion reduces?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 0}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Total floating-point operations (FLOPs) of the model.", "The model's memory footprint on disk.", "Kernel launch overhead and DRAM traffic between layers.", "The peak power draw of the accelerator."], "correct_index": 2}}, {"id": "edge-0154", "title": "The Fusion Overhead Fallacy", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If you fuse these three operations into a single kernel, what is the approximate percent latency reduction assuming a 5µs kernel launch overhead?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 1}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~14.3% latency reduction", "~28.6% latency reduction", "~50.0% latency reduction", "10µs"], "correct_index": 1}}, {"id": "edge-0156", "title": "The Watchdog Timer and the Checkpoint Tax", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How does the 1ms checkpoint affect the 50ms watchdog budget, and what maximum inference latency is allowed when checkpointing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["43ms", "49ms", "42ms", "42.9ms"], "correct_index": 2}}, {"id": "edge-0157", "title": "The Dusty Lens Problem", "topic": "real-time-deadlines", "competency_area": 
"data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much of the 33ms budget is spent just reading one 1920x1080 8-bit grayscale frame from DRAM for contrast normalization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~830 µs", "~1.2 ms", "~10 µs", "~81 µs"], "correct_index": 2}}, {"id": "edge-0158", "title": "The Communication Tax of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From an operational cost perspective, what factor do you need to identify as the primary driver of your budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The compute cost of aggregating the one million models on the central server.", "The power drawn by the one million edge devices to train the model locally.", "The network bandwidth cost to transfer all model updates to the cloud.", "The storage cost for the historical archive of all global models."], "correct_index": 2}}, {"id": "edge-0159", "title": "The Federated vs. Centralized Upload Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 1,000 cameras, how do daily upload volumes compare between centralized image uploads and federated MobileNetV3 weight updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: ~100 MB, Federated: ~8 MB. Centralized is more expensive but only by about 12.5x.", "Centralized: ~100 GB, Federated: ~16 GB. Centralized is ~6.25x more expensive.", "Centralized: ~100 GB, Federated: ~8 GB. Centralized is ~12.5x more expensive.", "Centralized: ~100 GB, Federated: ~4 GB. 
Centralized is ~25x more expensive."], "correct_index": 2}}, {"id": "edge-0160", "title": "The Edge Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate TOPS-per-Watt efficiency of a Hailo-8 accelerator at its nominal power envelope?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1 TOPS/W", "~4.5 TOPS/W", "~10 TOPS/W", "~26 TOPS/W"], "correct_index": 2}}, {"id": "edge-0161", "title": "The Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a CNN at 500 Ops/Byte on hardware with a 1,342 Ops/Byte ridge point, is the bottleneck memory or compute?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 0}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute (TOPS)", "Memory Bandwidth (GB/s)", "Power Consumption (Watts)", "On-chip Interconnect"], "correct_index": 1}}, {"id": "edge-0162", "title": "The Edge Ridge Point", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the Orin's INT8 ridge point from 275 TOPS and 204.8 GB/s, and how does it separate memory-bound from compute-bound models?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 1}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.34 Ops/Byte (Dividing 275 / 204.8 directly without unit conversion).", "~0.0007 Bytes/Op (Inverting the formula: Bandwidth / Compute).", "~1343 Ops/Byte. A model's AI must be higher than this to be compute-bound.", "~1343 Ops/Byte. A model's AI must be lower than this to be compute-bound."], "correct_index": 2}}, {"id": "edge-0163", "title": "The On-Chip vs. 
Off-Chip Memory Chasm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is an LPDDR5 DRAM read than an on-chip L2 cache hit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2-3x slower", "~25x slower", "~100x slower", "The speed is the same, just the capacity is different"], "correct_index": 1}}, {"id": "edge-0164", "title": "The Edge LLM's Memory Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If the device needs to support a context window of 4096 tokens in FP16 precision, how much of the Jetson's VRAM will be consumed solely by the KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~402 MB", "~4 GB", "~805 MB", "~1.6 GB"], "correct_index": 2}}, {"id": "edge-0165", "title": "The TinyML Tensor Arena Calculation", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum required size for the tensor arena to execute this specific operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40 KB", "95 KB", "65 KB", "256 KB"], "correct_index": 2}}, {"id": "edge-0167", "title": "The Quantization Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What percentage reduction in weight memory should you expect when quantizing 350M parameters from FP16 to INT8?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A 75% reduction.", "A 100% reduction.", "A 50% reduction.", "A 25% reduction."], "correct_index": 2}}, {"id": "edge-0168", "title": "The Edge Transformer Parameter Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Which first-layer design is more memory-efficient for the 64x64 patch model: self-attention or a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Transformer is cheaper; its parameter cost is 128*128 which is less than the 3*3*128*128 of a standard CNN.", "The CNN is cheaper; it has ~17.5k parameters while the Transformer has ~65.5k, a ~3.7x difference.", "They are roughly equal; the Transformer 4*128*128 cost is similar to a standard convolution 3*3*128*128 cost.", "The Transformer is cheaper; the attention calculation softmax(Q*K^T) has no parameters, making it more efficient."], "correct_index": 1}}, {"id": "edge-0169", "title": "The Real-Time Batching Tax", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", 
"bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental trade-off you must immediately identify for a real-time system like this?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 0}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It strictly improves throughput by amortizing computation, with no significant impact on individual frame latency.", "It primarily reduces the power consumption of the edge device by increasing utilization.", "It increases system throughput but also increases the processing latency for every single frame.", "It mainly increases the required on-chip memory, which is the key constraint."], "correct_index": 2}}, {"id": "edge-0171", "title": "The Continuous Batching Sweet Spot", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What batch size maximizes throughput while keeping worst-case frame latency within 150ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8", "23", "11", "As large as memory allows"], "correct_index": 0}}, {"id": "edge-0172", "title": "The Duty Cycle Power Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the average power consumption over this 10-second active/sleep cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.0 mW (Calculated Trap: Ignoring sleep power)", "~5.0 mW (Calculated Trap: Unweighted average of power states)", "~1.01 mW", "~10.0 mW (Calculated Trap: Ignoring the duty cycle entirely)"], "correct_index": 2}}, {"id": "edge-0173", "title": "The Edge Compute Ceiling", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before analyzing the model architecture, what is the advertised peak INT8 compute performance of a single Jetson AGX Orin device?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["26 TOPS", "989 TOPS", "275 TOPS", "100 TOPS"], "correct_index": 2}}, {"id": "edge-0174", "title": "The Watchdog Timer's Deadline", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What does the 500ms watchdog do, and what maximum inference latency remains after reserving 150ms for clean shutdown?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 0}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["500ms. The entire watchdog window is the latency budget.", "650ms. The shutdown time is added to the watchdog timeout.", "350ms. The safe shutdown time must be subtracted from the total budget.", "150ms. The inference budget is equal to the safe shutdown time."], "correct_index": 2}}, {"id": "edge-0175", "title": "The Data Gravity of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest operational cost avoided by choosing Federated Learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cost of on-device compute for local training.", "The cost of the central server to aggregate model updates.", "The cost of network data transfer to and from the devices.", "The initial hardware cost (CapEx) of the devices."], "correct_index": 2}}, {"id": "edge-0177", "title": "The Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing two different chips, what is the single most important efficiency metric to consider?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The chip's maximum theoretical compute performance (peak TOPS).", "The memory bandwidth available per watt.", "Power efficiency: the operations delivered per watt of power consumed.", "The financial cost per TOP of performance ($/TOP)."], "correct_index": 2}}, {"id": "edge-0178", "title": "The Bottleneck Identity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Is this element-wise operation typically compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Neither, it's bound by the PCIe bus.", "Compute-bound", "Memory-bound", "It is always compute-bound on a GPU."], "correct_index": 2}}, {"id": "edge-0179", "title": "The Perception Model's Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 50 GOPS per 50 MB inference workload memory-bound or compute-bound on the AGX Orin, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The AI is 1000 Ops/Byte, making the model memory-bound.", "The AI is 1000 Ops/Byte, making the model compute-bound.", "The AI is 1342 Ops/Byte, making the model compute-bound.", "The AI is 1 Op/Byte, making the model memory-bound."], "correct_index": 0}}, {"id": "edge-0180", "title": "The SRAM Tensor Arena", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary purpose of the Tensor 
Arena memory region?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To permanently store the model's weights in fast memory.", "To serve as a dedicated buffer for Direct Memory Access (DMA) transfers from peripherals.", "To provide a statically-allocated region for tensor activations, avoiding dynamic memory calls.", "To act as a software-managed cache for data stored in slower, external DRAM."], "correct_index": 2}}, {"id": "edge-0181", "title": "The ADAS VRAM Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM do the 44M FP16 parameters require, and what total is needed after adding 14 MB of activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~88 MB. The activation memory is temporary and doesn't count towards the total.", "~704 MB. You need to use the training memory rule of thumb (16 bytes/param).", "~102 MB. The total is the sum of parameter and peak activation memory.", "~58 MB. It's 44MB for weights (INT8) plus 14MB for activations."], "correct_index": 2}}, {"id": "edge-0182", "title": "The Real-Time Sensor Fusion Dilemma", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Where should the 2 MB INT8 weights be stored given 1 MB SRAM, and what hardware should move camera data without CPU bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Store weights in DRAM and have the CPU perform a `memcpy` to move camera data.", "Store weights in DRAM; use a DMA controller to move data from camera to DRAM and from DRAM to SRAM for processing.", "The model won't fit because the 2 MB of weights are larger than the 1 MB of SRAM.", "Store weights in DRAM and have the compute engine access them directly from DRAM for every calculation."], "correct_index": 1}}, {"id": "edge-0183", "title": "The Quantization Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate energy saving for a single compute operation when switching from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2× more efficient", "~4× more efficient", "~18× more efficient", "~100× more efficient"], "correct_index": 2}}, {"id": "edge-0186", "title": "The Perception Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the vehicle can react safely to its environment, what is the maximum permissible end-to-end latency for processing a single frame, often referred to as the 'frame budget'?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 0}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "16 ms", "33 ms", "1 ms"], "correct_index": 2}}, {"id": "edge-0187", "title": "The Real-Time Deadline Violation", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If batch size doubles to 2 and inference scales linearly from 20ms, can the system still meet the 33ms frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system becomes more efficient because average throughput increases.", "The system violates its deadline because the batch processing time exceeds the per-frame budget.", "The system is fine because the average processing time per frame is still 20ms.", "The deadline is missed, but it is okay because the GPU is more utilized."], "correct_index": 1}}, {"id": "edge-0188", "title": "The Continuous Batching Throughput Advantage", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which gives higher TPOT for variable arrivals: static batching of 8 with a 10ms timeout or continuous batching, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Static batching is better because a larger batch size of 8 is more hardware-efficient.", "Continuous batching is better because it minimizes GPU idle time by processing requests from a queue as soon as capacity is available.", "The throughput will be identical because the underlying hardware performance is the same.", "Static batching with a very short timeout (e.g., 1ms) would be better than continuous batching."], "correct_index": 1}}, {"id": "edge-0190", "title": "The Corrupted Sensor Stream", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How can switching from a 4-lane to 2-lane MIPI CSI-2 interface hurt accuracy, and how do their data rates compare if 4 lanes provide 2.5 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 2-lane interface has half the bandwidth (1.25 GB/s), but this only increases latency and doesn't change the image data itself.", "The camera's data rate is only 2.5 GB/s, so it won't be a problem for either interface.", "The 2-lane interface has half the bandwidth (1.25 GB/s), which may force the system to apply lossy compression or drop frames, causing a training-serving skew.", "The MIPI interface bandwidth is irrelevant; the bottleneck would be the Jetson AGX Orin's memory bandwidth (204.8 GB/s), which is much higher."], "correct_index": 2}}, {"id": "edge-0191", "title": "The Energy Cost of Data Movement", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much more energy is consumed by a single off-chip DRAM access compared to a single FP16 
computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10x", "Compute is ~10x more expensive", "~580x", "~50,000x"], "correct_index": 2}}, {"id": "edge-0192", "title": "The Power Efficiency Metric", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing two chips, like a Jetson AGX Orin and a Hailo-8, what is the single most important metric for evaluating power efficiency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "TOPS/W", "Memory Bandwidth (GB/s)", "TDP (Thermal Design Power)"], "correct_index": 1}}, {"id": "edge-0193", "title": "The Roofline Litmus Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Given an arithmetic intensity of 600 Ops/Byte and an INT8 ridge point of 1,342 Ops/Byte, what is the primary performance bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound", "Memory-bound", "Network-bound", "Power-bound"], "correct_index": 1}}, {"id": "edge-0195", "title": "The Edge VLM's KV-Cache Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM is required for the FP16 KV-cache with 24 layers, 16 heads, head dimension 128, and 4096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["384 MiB", "1536 MiB", "768 MiB", "805 MiB"], "correct_index": 2}}, {"id": "edge-0196", "title": "The Radar's SRAM Ingestion Time", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Interpret the bus specification and calculate the minimum time required to DMA a 256 KB frame over a 2.5 GB/s MIPI bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~102 ms (Unit Error: Confusing GB/s with MB/s)", "~1 µs (Concept Error: Quoting bus latency instead of calculating transfer time)", "~105 µs", "~819 µs (Unit Error: Confusing Gigabytes (GB) with Gigabits (Gb))"], "correct_index": 2}}, {"id": "edge-0197", "title": "The Energy Cost of Precision: Extreme Quantization", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much energy does a compute operation save when switching from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around 4×, because it uses 4 times fewer bits.", "Around 3-4×, the savings for using half-precision (FP16).", "Around 18×, due to the complexity of floating-point vs. 
integer logic.", "Over 100×, similar to the latency gap between cache and DRAM."], "correct_index": 2}}, {"id": "edge-0200", "title": "The 30 FPS Rule", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To maintain a smooth 30 frames-per-second (FPS) processing rate, what is the hard real-time latency budget for a single frame of computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "16 ms", "33 ms", "1 ms"], "correct_index": 2}}, {"id": "edge-0201", "title": "The Sensor Fusion Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum acceptable system latency before the system violates its deadline and risks a critical failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33 ms", "28 ms", "5 ms", "61 ms"], "correct_index": 2}}, {"id": "edge-0202", "title": "The Continuous Batching Throughput Gain", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the percentage increase in overall request throughput achieved by using continuous batching?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0%", "55%", "45%", "120%"], "correct_index": 3}}, {"id": "edge-0203", "title": "The Duty Cycle Power Gap", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate active-to-deep-sleep power consumption ratio for a TinyML microcontroller in this acoustic sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10x", "100x", "1,000x", "100,000x"], "correct_index": 2}}, {"id": "edge-0204", "title": "The Sensor Pipeline Skew", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary ML systems risk of enabling lossy MJPEG compression before the perception model and disk logging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The raw data rate is ~237 MB/s. The risk is that this is still too high for some storage systems.", "The raw data rate is ~373 MB/s. This exceeds the bandwidth of a 4-lane MIPI CSI-2 bus, so compression is mandatory.", "The raw data rate is ~373 MB/s. The key risk is training-serving skew, as the model was not trained on the compression artifacts introduced by MJPEG.", "The raw data rate is ~3,800 MB/s. 
The main risk is overwhelming the Jetson AGX Orin's memory bandwidth."], "correct_index": 2}}, {"id": "edge-0205", "title": "The Privacy-Preserving Driver Cam", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a system design perspective, what is the primary reason a company would choose Federated Learning over centralized data collection for driver monitoring?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce total training compute cost by distributing it to the edge.", "To achieve higher model accuracy than is possible with centralized training.", "To preserve user privacy by keeping raw video data on the vehicle.", "To allow model training even when vehicles have zero network connectivity."], "correct_index": 2}}, {"id": "edge-0207", "title": "The TOPS vs. TOPS/W Tradeoff", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing chips, what does the 'TOPS/W' (tera-operations per second per watt) metric fundamentally represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak performance: the maximum theoretical compute power of the chip.", "Computational efficiency: the number of operations delivered per watt of power.", "Memory bandwidth: how quickly the chip can access its memory.", "Economic cost: the price of the chip per unit of performance."], "correct_index": 1}}, {"id": "edge-0208", "title": "The Sustained TOPS Reality Check", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "At 15W and 4.6 TOPS/W, what realistic sustained INT8 TOPS should you budget for instead of the 275 TOPS peak?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "4.6 TOPS", "69 TOPS", "26 TOPS"], "correct_index": 2}}, {"id": "edge-0213", "title": "The Voice Assistant's First Word", "topic": "ota-firmware-updates", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the driver perceives the system as highly responsive and 'instant', which of the following metrics is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Time To First Token (TTFT)", "End-to-End Generation Time", "Achieved Batch Throughput (requests/sec)"], "correct_index": 1}}, {"id": "edge-0214", "title": "The Real-Time Batching Fallacy", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why will batching two 30 FPS 
frames violate the 33ms deadline even though single-frame inference takes 20ms?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 1}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It might work if the accelerator is efficient, as total throughput increases.", "It fails because the processing time for two frames (2 x 20ms = 40ms) is longer than the 33ms deadline.", "It fails because the system must wait 33ms for the second frame to arrive before starting the ~35ms batch inference, making the first frame's result available at ~68ms.", "It will work, because the average latency per frame in the batch is less than 33ms."], "correct_index": 2}}, {"id": "edge-0215", "title": "The Hidden Cost of Continuous Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With a 15ms batching timeout and 20ms inference, what latency does an idle single frame see, and does it meet the 33ms deadline?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The frame misses its deadline, as its total latency becomes 15ms (wait) + 20ms (inference) = 35ms.", "The frame meets its deadline, as its inference time is only 20ms, well within the 33ms budget.", "It's impossible to know without knowing the batching overhead of the inference server.", "The frame meets its deadline, because the timeout (15ms) is less than the inference time (20ms)."], "correct_index": 0}}, {"id": "edge-0216", "title": "The Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure TCO (Total Cost of Ownership) perspective on an edge fleet, what is the primary economic reason a company might choose Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It saves power on the edge device by requiring less on-device computation.", "It reduces cloud compute costs by pre-aggregating gradients on the edge.", "It dramatically reduces network bandwidth costs by avoiding raw video uploads.", "It lowers the direct cost of compliance with privacy regulations."], "correct_index": 2}}, {"id": "edge-0221", "title": "Sizing an SRAM Tensor Arena for DMA", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum required size for the tensor arena to run this model without running out of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["64 KB (Largest single tensor)", "112 KB (Peak concurrent memory)", "160 KB (Sum of all tensors)", "96 KB (Peak at first layer only)"], 
"correct_index": 1}}, {"id": "edge-0223", "title": "The Activation Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What are the FP16 and INT8 memory footprints for an 80x80x256 activation map, and what memory reduction does INT8 provide?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~6.5 MiB (FP16) and ~1.6 MiB (INT8), a 4x reduction", "~1.6 MiB (FP16) and ~0.8 MiB (INT8), a 2x reduction", "~3.1 MiB (FP16) and ~1.6 MiB (INT8), a 2x reduction", "~3.1 MiB (FP16) and ~0.4 MiB (INT8), an 8x reduction"], "correct_index": 2}}, {"id": "edge-0226", "title": "The Edge vs. Cloud Power Divide", "topic": "energy-per-operation", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure hardware power perspective, roughly how much more power does a single high-end Cloud GPU consume compared to a typical automotive-grade Edge GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 3-5x more power", "About 100x more power", "Over 20x more power", "They consume roughly the same amount of power"], "correct_index": 2}}, {"id": "edge-0229", "title": "The Drone's Data Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given a hardware ridge point of about 1,342 Ops/Byte, is this model memory-bound or compute-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The model requires 1 TOPS, and the Orin only has 275 TOPS, so it's clearly limited by compute.", "Memory-bound. Its AI of 500 Ops/Byte is less than the Orin's ridge point (~1,342 Ops/Byte), so performance is limited by memory bandwidth.", "Compute-bound. Its AI of 500 Ops/Byte is less than the Orin's ridge point, meaning the processor can't keep up with the required operations.", "Neither. 
With over 200 GB/s of bandwidth, the device can supply the 2 GB of data almost instantly, so there is no bottleneck."], "correct_index": 1}}, {"id": "edge-0234", "title": "The 30 FPS Perception Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a widely recognized industry-standard deadline for processing a single frame in such a system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 ms", "33 ms", "100 ms", "250 ms"], "correct_index": 1}}, {"id": "edge-0236", "title": "The ISP Night-Driving Skew", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which hardware component in the sensor-to-SoC pipeline is the most likely cause of this training-serving skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The main ML accelerator (e.g., the Jetson's Tensor Cores)", "The DRAM memory controller", "The Image Signal Processor (ISP)", "The MIPI CSI-2 physical interface"], "correct_index": 2}}, {"id": "edge-0237", "title": "The Federated Fleet's Billion-Dollar Upload Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest cost that Federated Learning directly addresses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["On-device Compute Power", "Cloud Storage Cost", "Data Transmission Cost (Cellular/Ingress)", "Privacy Compliance Overhead"], "correct_index": 2}}, {"id": "edge-0239", "title": "The Fusion Dividend", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary purpose of applying operator fusion in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To decrease the model's parameter count and memory footprint on disk.", "To combine sequential operations into a single kernel, reducing DRAM traffic and latency.", "To increase the model's arithmetic complexity (FLOPs) to achieve higher accuracy.", "To simplify the model's Python code by abstracting multiple layers into a single function call for readability."], "correct_index": 1}}, {"id": "edge-0240", "title": "The Fusion Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What DRAM traffic do the unfused and fused 1 MB Conv-BatchNorm-ReLU sequence require, and what reduction does fusion achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2 MB", "4 MB", "5 MB", "0 MB, 
because fusion optimizes compute, not memory."], "correct_index": 1}}, {"id": "edge-0241", "title": "The Fundamental Bottleneck Metric", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fundamental metric you would calculate to make this determination?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TOPS/W", "P99 Latency (ms)", "Arithmetic Intensity (Ops/Byte)", "Total Activation Memory (MB)"], "correct_index": 2}}, {"id": "edge-0244", "title": "The INT8 Energy Dividend (edge-0244)", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a pure hardware physics perspective, what is the approximate energy saving for a single compute operation when using INT8 versus FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x. This matches the exact memory capacity reduction (32 bits to 8 bits).", "~18x. This reflects the exponential difference in transistor switching logic between floating-point and integer ALUs.", "~16x. This reflects a squared relationship of the bit reduction.", "~256x. This assumes energy scales exponentially with the number of representable values (2^8 vs 2^32)."], "correct_index": 1}}, {"id": "edge-0247", "title": "The Hard Real-Time Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following metrics is the most important to define as your primary optimization target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Maximizing batch throughput (Frames Per Second)", "Minimizing average latency (P50)", "Minimizing worst-case latency (P99/P100)", "Maximizing power efficiency (TOPS/Watt)"], "correct_index": 2}}, {"id": "edge-0249", "title": "The Federated Fleet's Data Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following daily data collection strategies results in the highest data cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Uploading 10MB of model gradients per robot via Federated Learning.", "Uploading 500MB of raw camera images per robot for central training.", "Uploading 1MB of summary statistics per robot from on-device analytics.", "Triggering a 'debug dump' on 1% of the fleet, uploading a 1GB snapshot from each."], "correct_index": 1}}, {"id": "edge-0251", "title": "The Edge Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a typical CNN, is the workload likely to be compute-bound or memory-bound on this hardware?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the GPU has 275 TOPS and CNNs are computationally intensive.", "Power-bound, because TOPS/W is the most critical metric for an autonomous vehicle.", "Memory-bound, because the model's arithmetic intensity is lower than the hardware's ridge point.", "Neither, a well-optimized model should be balanced perfectly on the ridge point."], "correct_index": 2}}, {"id": "edge-0252", "title": "The Edge Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the Jetson AGX Orin's ridge point in Ops/Byte given 275 TOPS of INT8 performance and 204.8 GB/s memory bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.3 Ops/Byte", "~18.3 Ops/Byte", "~1343 Ops/Byte", "~0.74 Bytes/Op"], "correct_index": 2}}, {"id": "edge-0255", "title": "The INT8 Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What will the total memory footprint of the model's weights be after quantizing from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4 GB", "16 GB", "8 GB", "12 GB"], "correct_index": 2}}, {"id": "edge-0258", "title": "The Cost of Data Gravity in Automotive Fleets", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary operational cost that using Federated Learning is designed to mitigate compared to a traditional, fully centralized approach where all data is uploaded to the cloud?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud compute (GPU) costs for model training.", "On-device compute energy consumed during local training.", "Network data transmission (backhaul) costs.", "Cloud storage costs for the raw sensor data."], "correct_index": 2}}, {"id": "edge-0261", "title": "The Autonomous Driving Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this specific layer compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the hardware's ridge point is only ~1.3 Ops/Byte, which the workload far exceeds.", "Compute-bound, because the 400 Giga-Ops workload is a significant fraction of the chip's 275 TOPS capacity.", "It is perfectly balanced, as the arithmetic intensity and ridge point are roughly within a factor of 2x of each other.", "Memory-bound, because its arithmetic intensity (~667 Ops/Byte) is lower than the hardware's ridge point (~1343 Ops/Byte)."], "correct_index": 3}}, {"id": "edge-0264", "title": "The Perception Pipeline's Precision Problem", "topic": "real-time-deadlines", "competency_area": 
"precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What first-order effect will INT8 quantization have on the DRAM write bandwidth for the FP16 activation bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It has no significant impact on memory bandwidth, only on compute.", "It reduces the memory bandwidth requirement by 4x.", "It halves the memory bandwidth requirement for the operation.", "It doubles the available hardware memory bandwidth of the Jetson device."], "correct_index": 2}}, {"id": "edge-0266", "title": "The Unstable Perception Queue", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With 30 FPS arrivals and 40 ms processing, how many frames are queued after one second, excluding any frame currently being processed?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 0}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0 frames", "25 frames", "5 frames", "1 frame"], "correct_index": 2}}, {"id": "edge-0267", "title": "The Cellular Data Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the primary economic cost that Federated Learning is designed to reduce in this edge scenario?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 0}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stronger user privacy guarantees", "Lower on-device hardware (CapEx) requirements", "Massive reduction in cellular data transmission costs", "Reduced cloud compute (TFLOPS) cost for training"], "correct_index": 2}}, {"id": "edge-0268", "title": "Identifying Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the standard term for the ratio of compute operations to bytes of data moved from memory, which is the primary determinant of whether a workload is compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TOPS per Watt", "Operational Throughput", "Arithmetic Intensity", "Compute-to-Memory Ratio"], "correct_index": 2}}, {"id": "edge-0269", "title": "The Orin's Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is this specific layer compute-bound or memory-bound on the AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, 
because its Arithmetic Intensity (800 Ops/Byte) is greater than the Ridge Point.", "Memory-bound, because the model's Arithmetic Intensity (800 Ops/Byte) is less than the Orin's Ridge Point (~1342 Ops/Byte).", "Compute-bound, because 200 GOPS is a very large number of operations.", "Memory-bound, because 250 MB is larger than the Orin's cache."], "correct_index": 1}}, {"id": "edge-0270", "title": "The Edge VRAM Budget: VRAM Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Assuming the model runs in FP16 precision, what is the minimum VRAM required for the model's weights and activations combined?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["60 MB", "150 MB", "210 MB", "180 MB"], "correct_index": 2}}, {"id": "edge-0271", "title": "The INT8 vs FP16 Energy Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much more energy does a single FP16 operation consume compared to an INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x more energy", "~5x more energy", "~18x more energy", "The energy consumption is roughly the same"], "correct_index": 1}}, {"id": "edge-0272", "title": "The INT8 Memory Payoff: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does the storage footprint change, and what is the final size in MB, after quantizing 11M FP16 weights to INT8?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["22 MB", "5.5 MB", "11 MB", "88 MB"], "correct_index": 2}}, {"id": "edge-0274", "title": "The Perception Pipeline's Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is the 30 FPS system safe for a hard 33 ms deadline if the model averages 25 ms per frame?", "chain_ids": ["edge-chain-bucket-realtime-02"], "chain_positions": {"edge-chain-bucket-realtime-02": 1}, "chain_tiers": {"edge-chain-bucket-realtime-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it's safe. The average processing time of 25ms is less than the 33ms deadline.", "No, it's unsafe. The 75% utilization is too high, causing queue delays that will violate the hard real-time deadline.", "Yes, it's safe. 
It can process 40 frames per second (1000ms / 25ms), which is more than the required 30 FPS.", "No, it's unsafe because the system utilization is over 100% (33.3ms / 25ms)."], "correct_index": 1}}, {"id": "edge-0279", "title": "The SRAM Bottleneck", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the memory footprint of the 112x112x256 FP16 activation tensor, and will it fit in 4 MB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.06 MB; it fits. (Misconception: Assumes 1 byte per element/INT8 precision)", "49.0 MB; it does not fit. (Misconception: Confuses bits with bytes, using 16 bytes for FP16)", "6.13 MB; it does not fit.", "6.42 MB; it does not fit. (Misconception: Uses 1,000,000 instead of 1024*1024 for MB conversion)"], "correct_index": 2}}, {"id": "edge-0283", "title": "The Real-Time Perception Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum per-frame latency allowed for a 30 FPS hard real-time perception model?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 1}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 ms", "30 ms", "33 ms", "100 ms"], "correct_index": 2}}, {"id": "edge-0285", "title": "The Myth of Peak TOPS", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a realistic sustained TOPS number you should use for your initial performance estimates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "15 TOPS", "69 TOPS", "4.6 TOPS"], "correct_index": 2}}, {"id": "edge-0288", "title": "The FP16 vs. 
INT8 Energy Tax", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much more energy does a single FP16 MAC (Multiply-Accumulate) operation consume compared to an INT8 MAC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2× more energy", "Roughly 5× more energy", "Roughly 18× more energy", "The energy difference is negligible"], "correct_index": 1}}, {"id": "edge-0289", "title": "The Edge Quantization Footprint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the on-disk size reduction when quantizing the 50M-parameter FP16 LiDAR model to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The size is reduced by 100 MB.", "The size is reduced by 400 MB.", "The size is reduced by 50 MB.", "There is no change in memory size, only compute speed."], "correct_index": 2}}, {"id": "edge-0291", "title": "The Perception Pipeline's Processing Deficit", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the 40 FPS system with 30 ms processing meet its real-time deadline, and what is the per-frame deficit or surplus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is unstable; it has a 5ms deficit per frame because processing takes longer than the frame arrival interval.", "The system is stable; it has a 5ms surplus per frame.", "The system is stable; 30ms is faster than the standard 33ms (30 FPS) automotive deadline.", "The system is unstable; it has a 10ms deficit per frame (40ms - 30ms)."], "correct_index": 0}}, {"id": "edge-0293", "title": "The Fleet Data Toll", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What privacy benefit does FL provide, and what is the daily centralized upload cost for 10,000 vehicles at 10 GB each and $0.10/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$1.00 per day. (This is the cost for a single vehicle, failing to account for the fleet size.)", "$50.00 per day. (This is the cost for the Federated Learning approach, not the requested centralized one.)", "$10,000 per day. (Correct: 10,000 vehicles x 10 GB/vehicle x $0.10/GB)", "$10.00 per day. 
(This calculation incorrectly assumes 10 MB of data per vehicle instead of 10 GB, a common unit error.)"], "correct_index": 2}}, {"id": "edge-0294", "title": "The Quadratic Bottleneck", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which of the following operations has a computational complexity (FLOPs) that scales quadratically with the number of input patches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A 3x3 standard convolution", "A self-attention layer", "A ReLU activation function", "A depthwise separable convolution"], "correct_index": 1}}, {"id": "edge-0295", "title": "The Deceptive Pointwise Convolution", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the fundamental hardware reason for this poor performance?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 0}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound and the Orin's TOPS are insufficient.", "The device is thermally throttling due to the high workload.", "The layer is memory-bound due to its low arithmetic intensity.", "The CUDA compiler is generating inefficient machine code for 1x1 convolutions."], "correct_index": 2}}, {"id": "edge-0296", "title": "The Orin's Perception Dilemma", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the 20 GOPS per 13.1 MB layer compute-bound or memory-bound, and what sustainable TOPS should you expect at 30W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, ~138 TOPS", "Compute-bound, 275 TOPS", "Compute-bound, ~138 TOPS", "Memory-bound, ~102 TOPS"], "correct_index": 2}}, {"id": "edge-0299", "title": "The Activation Bandwidth Bottleneck", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does INT8 activation quantization improve latency, and what are the 256x256x512 tensor footprints in FP16 and INT8?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 0}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It reduces the activation size from 64 MB to 8 MB, saving 56 MB of memory transfers.", "It makes the model's weights 2x smaller on flash, which speeds up initial model loading time.", "It halves the activation size from 64 MB to 32 MB, reducing the memory bandwidth required to read/write it.", "It saves 32 MB of DRAM, which prevents the OS from having to swap memory pages to disk."], "correct_index": 2}}, {"id": "edge-0302", "title": "The Federated 
Fleet's Primary Directive", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a systems and economics perspective, what is the primary motivation for choosing federated learning in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce total computational cost by offloading training from expensive cloud GPUs to the vehicle's edge processors.", "To achieve higher model accuracy than is possible with centralized training by using more diverse, real-world data.", "To massively reduce network bandwidth costs and preserve user privacy by not uploading raw sensor data.", "To enable faster, lower-latency inference decisions on the vehicle."], "correct_index": 2}}, {"id": "edge-0304", "title": "The Vision Transformer Quadratic Burden", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a fixed input image size, what is the fundamental scaling reason that the ViT self-attention layers are often computationally prohibitive on such devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Vision Transformers always have more parameters than CNNs.", "The self-attention mechanism's computational cost scales quadratically with the input sequence length.", "Edge NPUs lack the hardware to efficiently perform the matrix multiplications required by Transformers.", "The activation memory required by the Key-Value cache in Transformers is too large for the SRAM on edge devices."], "correct_index": 1}}, {"id": "edge-0305", "title": "The Power Efficiency Litmus Test", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When comparing chips, what does the TOPS/W (Trillion Operations Per Second Per Watt) metric fundamentally allow you to identify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The maximum theoretical compute throughput of the chip.", "The chip's performance on memory-bound vs.
compute-bound tasks.", "The computational performance delivered per unit of power consumed.", "The dollar cost per trillion operations."], "correct_index": 2}}, {"id": "edge-0306", "title": "The Edge Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given an arithmetic intensity of 2,000 Ops/Byte, is the model compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because the model's arithmetic intensity (2000) is greater than the Orin's ridge point (~1342 Ops/Byte).", "Memory-bound, because the peak memory bandwidth (204.8 GB/s) is numerically much smaller than the peak compute (275 TOPS).", "Compute-bound, because the model's arithmetic intensity (2000) is greater than the Orin's ridge point (~1342 Ops/Byte).", "It's impossible to tell without knowing the model's latency in milliseconds."], "correct_index": 2}}, {"id": "edge-0310", "title": "The Autonomous Driving Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "At 275 INT8 TOPS and a 33 ms deadline, what is the maximum theoretical number of TeraOps the model can perform per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["275 TOPS", "8,333 TOPS", "9.075 TeraOps", "9,075 TeraOps"], "correct_index": 2}}, {"id": "edge-0311", "title": "The Economics of Fleet Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary motivation for using FL in this automotive scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To achieve higher model accuracy than centralized training.", "To reduce the on-device compute requirements for inference.", "To reduce data transmission costs and protect user privacy.", "To enable faster model training cycles (wall-clock time)."], "correct_index": 2}}, {"id": "edge-0313", "title": "The Depthwise Separable Cost Advantage", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Approximately how much does this change reduce the number of floating-point operations (FLOPs)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About a 3× reduction", "About a 9× reduction", "About a 256× reduction", "About a 2× reduction"], "correct_index": 1}}, {"id": "edge-0317", "title": "The Perception Model's Memory Diet (edge-0317)", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much memory in MB is saved for the 256x256x128 activation tensor when converting it from FP16 to INT8?", 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~16.8 MB", "~4.2 MB", "~8.4 MB", "0 MB"], "correct_index": 2}}, {"id": "edge-0320", "title": "The Federated Learning Bandwidth Diet", "topic": "extreme-quantization", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the approximate size of the FP32 gradient update for a 5 million-parameter model?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["250 MB", "10 MB", "20 MB", "5 MB"], "correct_index": 2}}, {"id": "edge-0322", "title": "The Depthwise Separable Cost Reduction", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much compute do you save by replacing standard 3x3 convolutions with depthwise separable convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 100x reduction.", "Roughly 2x reduction.", "Roughly 9x reduction.", "There is no significant computational reduction."], "correct_index": 2}}, {"id": "edge-0323", "title": "The Thermal Budget Trap", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why might a 60W Jetson AGX Orin be unsuitable for a trunk compute module with a strict 30W power budget?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 275 TOPS performance is too low for a perception model.", "Its memory bandwidth is likely insufficient.", "Its 60W power requirement exceeds the system's 30W budget.", "A GPU is the wrong type of processor for this task."], "correct_index": 2}}, {"id": "edge-0324", "title": "The Jetson Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a layer with 5 trillion INT8 ops and 10 GB of DRAM reads, is it compute-bound or memory-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 5 trillion operations is a massive workload that will always saturate the GPU.", "Memory-bound, because its arithmetic intensity of 500 Ops/Byte is below the ridge point of ~1342 Ops/Byte.", "Compute-bound, because 5 TOPS / 275 TOPS = 1.8% utilization, meaning compute is the bottleneck.", "Memory-bound, because the memory access (10 GB) is larger than the compute (5 TOPS)."], "correct_index": 1}}, {"id": "edge-0328", "title": "The Autonomous Driving FPS Budget", "topic": 
"real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To maintain the 30 FPS rate, what is the absolute maximum inference latency your ML model can have?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33.3 ms", "10 ms", "23.3 ms", "43.3 ms"], "correct_index": 2}}, {"id": "edge-0329", "title": "The Privacy-First Powertrain", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From an ML systems perspective, what is the primary reason to choose Federated Learning in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce network bandwidth costs from uploading audio files", "To enable faster, real-time model updates for the entire fleet", "To avoid centralizing sensitive user voice data", "To achieve better model accuracy than centralized training"], "correct_index": 2}}, {"id": "edge-0331", "title": "The Units of Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the units of Arithmetic Intensity, the x-axis of this chart?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FLOPS (Floating Point Operations Per Second)", "Bytes/Op (Bytes per Operation)", "Ops/Byte (Operations per Byte)", "TOPS/W (Trillion Operations per Second per Watt)"], "correct_index": 2}}, {"id": "edge-0332", "title": "The Edge Accelerator Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Using the official specs—275 TOPS and 204.8 GB/s of memory bandwidth—what is the Ridge Point of the Jetson AGX Orin in Ops/Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.6 Ops/Byte", "~1.3 Ops/Byte", "~1,342 Ops/Byte", "~0.74 Ops/Byte"], "correct_index": 2}}, {"id": "edge-0334", "title": "The Perception Model's Memory Diet (edge-0334)", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 30 million FP16 parameters, what are the FP16 and INT8 weight footprints and total memory saved after quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reduction is 90 MB.", "The reduction is 60 MB.", "The reduction is 30 MB.", "The reduction is 15 MB."], "correct_index": 2}}, {"id": "edge-0336", "title": "The Federated Learning Privacy Advantage", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a privacy and data transmission 
perspective, what is the fundamental advantage of the Federated Learning approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized training saves $5,000/day by avoiding on-device compute costs.", "Federated learning compresses the 80,000 GB/day of raw video into an encrypted cloud format.", "It transmits only 1,000 GB/day of weight updates, keeping all 80,000 GB/day of raw PII on-device.", "Federated learning requires 80x more bandwidth because gradients are larger than raw images."], "correct_index": 2}}, {"id": "edge-0340", "title": "The Edge Memory Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the new INT8 weight-memory footprint for a 15 million-parameter FP16 model after quantization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 MB", "60 MB", "15 MB", "3.75 MB"], "correct_index": 2}}, {"id": "edge-0342", "title": "The Hidden Cost of On-Device Training", "topic": "safety-certification", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the most significant new operational cost introduced by choosing the federated learning approach at this scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased server costs for model aggregation.", "The one-time purchase price (CapEx) of the edge processors.", "The aggregate fleet-wide energy consumption for on-device computation.", "The network cost of transmitting model updates."], "correct_index": 2}}, {"id": "edge-0343", "title": "The Federated Fleet TCO: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles, what are the daily and annual cellular costs of uploading 1 GB raw data versus 10 MB FL updates at $2/GB?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 1}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: $20,000/day, Federated: $20,000/day. The model update is likely the same size as the raw data.", "Centralized: $2/day, Federated: $0.02/day. The costs are negligible.", "Centralized: $20,000/day, Federated: ~$200/day. Federated is ~100x cheaper.", "Centralized: $200,000/day, Federated: $2,000/day. 
Both are too expensive."], "correct_index": 2}}, {"id": "edge-0345", "title": "The Transformer's Quadratic Curse on Edge", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "For a fixed input image size, which operation is the primary compute bottleneck in the ViT, and how does its computational cost scale as you increase the number of input image patches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MLP Head, which scales linearly with the embedding dimension.", "Convolutional stem, which scales linearly with the number of pixels.", "Self-attention, which scales quadratically (O(N^2)) with the number of patches.", "Layer normalization, which scales linearly with the number of patches."], "correct_index": 2}}, {"id": "edge-0346", "title": "The Edge Roofline: Calculating the Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given the device's peak performance of 275 TOPS (INT8) and memory bandwidth of 204.8 GB/s, what is its approximate ridge point?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 1}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.34 Ops/Byte", "~4.6 Ops/Byte", "~1343 Ops/Byte", "~0.74 Bytes/Op"], "correct_index": 2}}, {"id": "edge-0349", "title": "The Depthwise Efficiency Gain", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a 112x112 layer with 64 input channels and 128 output channels, what FLOP reduction do you get from a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["An ~8.4x reduction in FLOPs", "A ~9x reduction in FLOPs", "A ~128x reduction in FLOPs", "A ~2x reduction in FLOPs"], "correct_index": 0}}, {"id": "edge-0350", "title": "The Edge Power Budget", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What typical power draw should you assume for a Hailo-8 edge AI accelerator in the fleet TCO analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30W", "700W", "10mW", "2.5W"], "correct_index": 3}}, {"id": "edge-0352", "title": "Edge Power Efficiency 101", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To make the most of your limited power budget, what is the primary metric you should use to evaluate the power efficiency of the compute?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 0}, "chain_tiers": 
{"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "Memory Bandwidth (GB/s)", "TOPS/W (Throughput per Watt)", "PUE (Power Usage Effectiveness)"], "correct_index": 2}}, {"id": "edge-0357", "title": "The Federated Learning Transmission Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In this setup, what is primarily transmitted from the vehicle back to the central server to update the global model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Raw sensor data (e.g., images, LiDAR point clouds).", "Anonymized user driving statistics and routes.", "Model updates (e.g., gradients or weights).", "Real-time inference latency and power consumption metrics."], "correct_index": 2}}, {"id": "edge-0358", "title": "The Federated Fleet Cost Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles, what is the daily bandwidth cost of uploading 50 GB raw data per car versus a 50 MB FL update at $0.02/GB?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$100,000 (Centralized) vs. $100 (Federated). A 1000x difference.", "$10,000 (Centralized) vs. $10 (Federated). A 1000x difference.", "$1,000,000 (Centralized) vs. $1,000 (Federated). A 1000x difference.", "$10,000 (Centralized) vs. $1,000 (Federated). A 10x difference."], "correct_index": 1}}, {"id": "edge-0360", "title": "The Activation Memory Budget: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the activation memory footprint for a 128x128x64 tensor in FP16 versus INT8?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 MB (FP16) vs. 0.5 MB (INT8)", "256 KB (FP16) vs. 128 KB (INT8)", "2 MB (FP16) vs. 1 MB (INT8)", "16 MB (FP16) vs. 8 MB (INT8)"], "correct_index": 2}}, {"id": "edge-0364", "title": "The Edge Efficiency Trade-off: Standard vs. 
Depthwise Convolution", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Can you explain the efficiency gain by calculating the approximate reduction in the number of parameters for this single layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameter count is reduced by ~2x, as the output channels are double the input.", "The parameter count is reduced by exactly 9x, the size of the 3x3 kernel.", "The parameter count is reduced by ~8.7x.", "The parameter count is reduced by ~256x, because the output channel dimension is factored out of the spatial convolution."], "correct_index": 2}}, {"id": "edge-0368", "title": "The Perception Model Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What trade-off does INT8 quantization introduce, and how much memory do 50 million weights require in FP16 versus INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 MB for FP16, 100 MB for INT8. (Incorrectly treats FP16 as 4 bytes and INT8 as 2 bytes)", "100 MB for FP16, 25 MB for INT8. (Incorrectly applies a 4x reduction factor, common for FP32->INT8)", "100 MB for FP16, 50 MB for INT8.", "50 MB for FP16, 12.5 MB for INT8. (Off by a factor of 2 and then incorrectly applies 4x reduction)"], "correct_index": 2}}, {"id": "edge-0372", "title": "The Depthwise Separable Efficiency Gain (edge-0372)", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For the 256-channel 3x3 layer, how many MACs per output pixel does a depthwise separable version use versus standard, and what is the reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~256x", "~2x", "~9x", "1x (no change)"], "correct_index": 2}}, {"id": "edge-0373", "title": "The Data Privacy Firewall", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a systems perspective, what is the primary and most fundamental reason to choose Federated Learning in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly reduces the amount of computation needed on the vehicle's processor.", "It achieves higher final model accuracy compared to centralized training.", "It keeps sensitive user data on the device, enhancing privacy and radically reducing network costs.", "It allows the vehicle to operate completely offline without any need for a central server."], "correct_index": 2}}, {"id": "edge-0375", "title": "The Efficiency Litmus Test", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", 
"phase": "both", "question": "What is the single most important efficiency metric you should use to compare how much compute performance each chip delivers for a given amount of power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Peak TOPS", "Power Draw (Watts)", "TOPS / Watt", "PUE (Power Usage Effectiveness)"], "correct_index": 2}}, {"id": "edge-0378", "title": "The Orin's Ridge Point: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the Jetson AGX Orin's ridge point from 275 TOPS and 204.8 GB/s, and what does that value represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~168 Ops/Byte", "~1.3 Ops/Byte", "~1,342 Ops/Byte", "~0.74 Bytes/Op"], "correct_index": 2}}, {"id": "edge-0381", "title": "The Data Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total daily data upload volume that the cloud ingest system must be designed to handle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 GB", "~1 TB", "~10 TB", "~100 TB"], "correct_index": 2}}, {"id": "edge-0382", "title": "The Federated ROI Calculation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Given centralized costs of 10 GB/car/month at 0.05/GB plus 20/TB storage versus $200k federated CapEx, what is the break-even time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 months (Assuming $10/TB storage)", "2.4 months (Assuming $5,000 OpEx)", "28.5 months", "The centralized approach is always cheaper"], "correct_index": 2}}, {"id": "edge-0386", "title": "The Residual Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of adding two 512x512x256 INT8 tensors with no caching, and is the operation compute- or memory-bound?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 1}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.0 Ops/Byte; it's memory-bound.", "~0.5 Ops/Byte; it's memory-bound.", "~0.33 Ops/Byte; it's compute-bound.", "~0.33 Ops/Byte; it's memory-bound."], "correct_index": 3}}, {"id": "edge-0387", "title": "The Quantization Energy Tax", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a hardware physics perspective, how much more energy does a single FP32 MAC consume than a 
single INT8 MAC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FP32 is ~2-4× more expensive.", "They consume roughly the same amount of energy.", "FP32 is ~18× more expensive.", "INT8 is ~4x more expensive."], "correct_index": 2}}, {"id": "edge-0388", "title": "The Quadratic Cliff", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For a 64x64 feature map with 128 channels, how does the compute cost of self-attention compare with a 3x3 convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly equivalent in cost, as they both process the same number of input pixels and have the same channel depth.", "The convolution is more expensive because its 3x3 kernel must scan across the entire feature map.", "The self-attention layer is over 7x more expensive due to its quadratic complexity with respect to the number of pixels.", "The self-attention layer is slightly cheaper because it has fewer parameters than the convolution's pointwise step."], "correct_index": 2}}, {"id": "edge-0389", "title": "The Federated Fleet Cost Equation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single biggest economic advantage of the Federated Learning approach in this scenario?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Lower on-device compute requirements.", "Reduced cloud storage costs for raw data.", "Drastically reduced cellular egress fees.", "Faster model convergence during training."], "correct_index": 2}}, {"id": "edge-0390", "title": "The Federated Fleet Economy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "For 10,000 vehicles at $0.10/GB, what is the daily data cost of a centralized 1 GB/day upload strategy compared to federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$10/day. This incorrectly uses the federated update size (10,000 * 10MB = 100GB) for the calculation.", "$0.10/day. This calculates the cost for only a single vehicle, not the entire fleet.", "$1000/day. This correctly calculates the total data from the fleet and applies the per-GB cost.", "$100,000/day. 
This represents a 100x calculation error, perhaps by confusing dollars and cents."], "correct_index": 2}}, {"id": "edge-0392", "title": "The Edge Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much weight memory, in GB, is saved by quantizing a 500 million parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.5 GB", "1.0 GB", "0.5 GB", "4.0 GB"], "correct_index": 2}}, {"id": "edge-0401", "title": "Annual Cellular Cost of Weekly OTA vs Delta Model Updates for a 10,000-Vehicle Fleet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the annual data cost for sending a 1 GB full-model OTA versus a 100 MB delta update weekly to 10,000 vehicles at $5/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Delta Update strategy is cheaper at $26,000 annually, versus $260,000 for the OTA strategy.", "The Delta Update strategy is cheaper at $2.6M annually, versus $26M for the OTA strategy.", "The Delta Update strategy is cheaper at $260k annually, versus $2.6M for the OTA strategy.", "The Delta Update strategy is cheaper at $5,000 annually, versus $50,000 for the OTA strategy."], "correct_index": 2}}, {"id": "edge-0402", "title": "The TOPS Illusion", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What critical distinction is the PM missing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0403", "title": "The DLA vs GPU Partition", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is your colleague right?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0404", "title": "YOLOv8-S vs ViT-B/16 Latency on Jetson Orin NX", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do you push back against replacing YOLOv8-S with ViT-B/16 on the Jetson Orin NX at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0405", "title": "The Edge Batch Size Paradox", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is this a terrible idea for your use case, and what does the roofline model tell you about batch=1 on edge?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 2}, "chain_tiers": 
{"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"Batching is essential because it increases NPU utilization. The roofline model shows batch=8 is compute-bound with higher throughput, so we should always use it.\"", "\"Batching introduces about 267ms of temporal staleness before inference even begins, which violates the real-time control requirement. The roofline model should be used to verify batch=1 latency against the <20ms deadline rather than chasing higher utilization.\"", "\"Batch=1 is compute-bound, so it's already optimal. Batching will make it memory-bound and increase latency.\"", "\"Batch=8 reduces inference latency to 0.345ms, so it should be used. The 267ms staleness is acceptable for a 1 m/s robot.\""], "correct_index": 1}}, {"id": "edge-0406", "title": "The Fleet Firmware Fragmentation Crisis", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you safely converge the fragmented fleet to the minimum compatible firmware versions for the new model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0407", "title": "Roofline Inference Latency on Jetson Orin", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the roofline model, what inference latency do you estimate, and is YOLOv8n compute-bound or memory-bound on the Orin NX?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 2}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.063ms, compute-bound", "0.207ms, compute-bound", "0.207ms, memory-bound", "0.063ms, memory-bound"], "correct_index": 2}}, {"id": "edge-0408", "title": "Power Budget for Multi-Model Edge Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 25.5W PoE+, what ML power budget remains, and can all three models run concurrently on the Jetson Orin Nano?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0409", "title": "OTA Update Time for Edge Fleet", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Under that staged rollout plan, what is the total fleet update time for the 57 MB update across 500 LTE cameras, and what is the bottleneck?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 1}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0411", "title": "The Sparsity Illusion", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": 
"diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the model still slow, and what pruning strategy should you apply to actually get a speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The pruning is not aggressive enough. Increase unstructured sparsity to 90% or higher.", "The model is memory bandwidth-bound. The pruned model needs to be quantized to INT8.", "The hardware can't skip zero-multiplies. Apply structured pruning to remove entire filters, which reduces the dimensions of the weight tensors and thus the total FLOPs.", "The CPU is the bottleneck. The model processing needs to be offloaded to a dedicated DSP."], "correct_index": 2}}, {"id": "edge-0412", "title": "The Invisible Inventory", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most likely cause of this sudden, targeted failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has drifted due to a change in store lighting, and the 'Glo-Cola' bottle was the first to be affected.", "A recent OTA model update was corrupted, causing it to fail on this specific object class.", "A physical adversarial patch has been placed on the 'Glo-Cola' bottles, making them invisible to the model.", "The camera sensors across the fleet have simultaneously developed a hardware fault that prevents them from seeing the specific red color of the 'Glo-Cola' logo."], "correct_index": 2}}, {"id": "edge-0413", "title": "The Kernel Launch Storm", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most effective first step to solve this performance bottleneck?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 2}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's arithmetic intensity is too low for the hardware; the batch size must be increased to provide more parallel work.", "The model needs to be pruned more aggressively to reduce the total number of FLOPs, as the compute is clearly the bottleneck.", "The system is bottlenecked by kernel launch overhead and DRAM traffic; apply operator fusion to combine sequential operations into single kernels.", "The model is not properly quantized, and using FP16 would provide better hardware mapping on the Tensor Cores."], "correct_index": 2}}, {"id": "edge-0414", "title": "The Real-Time Radar Deadline", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given this data, what is the most direct way to meet the 33ms hard real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has too many G-MACs. 
Unstructured pruning reduces G-MACs by 80% but does not resolve the 90ms memory stall.", "The accelerator clock speed is insufficient. Overclocking increases TOPS but memory stall remains 90ms.", "The model weight matrix is too large for the on-chip SRAM, causing stalls from memory swapping. Use knowledge distillation to create a smaller student model that fits in SRAM.", "The host CPU is bottlenecking the pipeline. C++ preprocessing saves 5ms but memory stall is still 90ms."], "correct_index": 2}}, {"id": "edge-0415", "title": "The AV Training Bottleneck", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is data parallelism only giving 1.2x speedup, and what parallelism strategy should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The data input pipeline is starving the accelerators. Increase the number of CPU data loader workers.", "The interconnect is insufficient for data parallelism. The hardware is flawed and a faster version (>200 GB/s) is needed.", "The `AllReduce` communication is a serial bottleneck. Switch to Pipeline Parallelism to reduce cross-accelerator data transfer.", "The batch size is too small, leading to low arithmetic intensity. Double the batch size to better saturate the hardware."], "correct_index": 2}}, {"id": "edge-0416", "title": "The Robot's Split Brain", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which interconnect solution will solve the latency problem for the 256 MB transfer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a 10GbE connection with RDMA to minimize latency.", "Bridge the two modules with an NVLink connector.", "Connect the modules via a PCIe Gen 4 switch.", "Use a standard 10GbE connection between the modules."], "correct_index": 2}}, {"id": "edge-0417", "title": "The Federated TCO Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which strategy has the lower TCO for 1 million cameras at $10/GB data cost, centralized streaming or federated learning, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The centralized approach is cheaper because it saves the company $3 million in upfront hardware costs.", "The federated approach is far cheaper, breaking even on the extra hardware cost in just 30 days due to enormous data transfer savings.", "The federated approach is cheaper, but the breakeven point is over 2 years, making it a risky investment.", "Neither is viable, as the on-device compute power cost for the federated fleet would exceed any potential data savings."], "correct_index": 1}}, {"id": "edge-0420", "title": "The Drone Vision Architecture Debate", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which 
architecture is the better fit for the Hailo-8, the lightweight ViT or the MobileNetV2-style CNN, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is better because its global self-attention is more powerful than local convolutions.", "The CNN is better because its operations have high spatial locality, matching the dataflow architecture of the accelerator.", "It doesn't matter which model is chosen; the accelerator's compiler will optimize the dataflow for maximum performance.", "The ViT is better because it requires fewer parameters than the CNN."], "correct_index": 1}}, {"id": "edge-0421", "title": "The Fragmented Inference Graph", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What optimization should you apply to get the 45ms model under the 33ms frame deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increase the GPU clock speed and power draw to make the existing kernels run faster.", "Apply structured pruning to the Conv2D layers to reduce the model's total FLOPs.", "Fuse the BatchNorm and Activation layers into their preceding Conv2D operations.", "Convert the entire model to INT8 precision to reduce memory bandwidth pressure."], "correct_index": 2}}, {"id": "edge-0422", "title": "The Drone's 25ms Mystery", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the 67 MB PCIe Gen4 transfer between Orins too slow, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 67 MB payload is too large for the PCIe Gen4 bus, which is saturated. 
The system requires an interconnect with higher bandwidth like NVLink.", "The LPDDR5 memory on the receiving module (Module B) is the bottleneck; its bandwidth is insufficient to ingest the 67 MB payload from the PCIe bus at speed.", "The transfer is defaulting to a CPU-mediated path instead of using direct peer-to-peer DMA (RDMA), introducing significant OS and driver overhead.", "The system is using a TCP/IP stack over the PCIe bus for communication, and the network protocol overhead is the source of the high latency."], "correct_index": 2}}, {"id": "edge-0423", "title": "The Federated Learning Power Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the incremental energy cost of the FL cameras versus the control group, scaled across a fleet of 10,000 cameras over one year?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0426", "title": "The Hardware-Unaware NAS Penalty", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely reason the NAS-generated model runs slower despite having 20% fewer MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NAS model over-utilized the on-chip SRAM, causing cache misses.", "The NAS algorithm was not run for enough epochs to find a truly optimal model.", "The model's fragmented graph structure prevents operator fusion, incurring significant kernel dispatch overhead on the accelerator.", "The 20% reduction in MACs was not enough to overcome the fixed latency costs of the Hailo-8's architecture."], "correct_index": 2}}, {"id": "edge-0427", "title": "The Sensor Fusion Sync Failure", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a PCIe Gen4 x16 link move the 256 MB feature map within the 33ms frame budget, and how much latency does it add?", "chain_ids": ["edge-chain-auto-secondary-009-21"], "chain_positions": {"edge-chain-auto-secondary-009-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["There must be a driver issue or excessive software overhead; a 32 GB/s link should transfer 256 MB in under 1ms.", "The CPUs orchestrating the DMA transfer are the bottleneck; the PCIe bus itself is not the issue.", "The PCIe bus is the bottleneck. 
The measured 9ms is close to the theoretical transfer time of ~8.1ms, indicating the link is saturated.", "You should replace the PCIe link with an InfiniBand NDR connection, as it is a higher performance networking fabric."], "correct_index": 2}}, {"id": "edge-0429", "title": "The Perception Model's Performance Ceiling", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary bottleneck keeping the Orin at 20 FPS under the 40W TDP, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. It has too many operations (2 T-Ops) for the GPU, so we must prune layers or simplify the architecture.", "The Jetson's power management is faulty. It is thermal throttling prematurely, as it should handle 40W without such a large performance drop.", "The model is memory-bandwidth bound. Its low arithmetic intensity means performance is limited by data movement, not compute.", "The bottleneck is the PCIe bus transferring data from the host CPU to the GPU before inference can begin."], "correct_index": 2}}, {"id": "edge-0432", "title": "The Stereo Vision PCIe Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely bottleneck causing the P2P_Transfer stall for the 512x512x256 FP16 feature map over PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The host CPU is too slow to manage the DMA transfer between the GPUs.", "The InfiniBand network connecting the car's compute unit to the cloud is saturated.", "The GPU compute is insufficient, and the model needs to be quantized or pruned.", "The PCIe bus's peer-to-peer transfer latency is too high for the 134MB feature map, consuming ~8-10ms of the 33ms budget."], "correct_index": 3}}, {"id": "edge-0434", "title": "The AV Perception Latency Puzzle", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the GPU roofline model, what primary bottleneck prevents the 500 G-Ops, 10 GB LiDAR model from reaching 30 FPS on the Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is thermal throttling. 
The 30W power draw indicates it can't access its full 60W performance budget.", "The model is too computationally complex (500 G-Ops) for the Jetson AGX Orin to execute within the 33ms time budget.", "The model is memory-bound; its low arithmetic intensity (50 Ops/Byte) is below the Orin's ridge point (~1343 Ops/Byte), so performance is limited by memory bandwidth.", "The bottleneck is high software overhead from launching too many small CUDA kernels, which adds significant latency."], "correct_index": 2}}, {"id": "edge-0436", "title": "The Sensor Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What interconnect bottleneck is most likely when moving a 512 MB tensor from GPU A to GPU B within the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server's InfiniBand network card is misconfigured, limiting its bandwidth.", "CPU driver overhead for initiating the async cudaMemcpy call is creating a multi-millisecond scheduling delay.", "The GPUs are communicating over the PCIe bus instead of a direct, high-bandwidth NVLink bridge.", "The cudaMemcpyPeerAsync operation is incorrectly blocking the CPU, which stalls the next perception stage."], "correct_index": 2}}, {"id": "edge-0438", "title": "The Transformer-CNN Resolution Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the most effective first step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Downsample the 1920x1080 camera input back to 1280x720 before feeding it to the ViT.", "The Jetson AGX Orin's 204.8 GB/s memory bandwidth is the bottleneck. The model needs to be moved to a platform with faster on-chip memory.", "Replace the Vision Transformer with an efficient CNN architecture (e.g., based on depthwise separable convolutions) and re-profile performance.", "Apply more aggressive INT4 quantization to the ViT. 
This should provide a ~2x speedup over INT8, which is enough to meet the deadline."], "correct_index": 2}}, {"id": "edge-0439", "title": "The Sensor Fusion Stall", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural bottleneck explains a 34ms GPU-to-GPU transfer for a 33 MB frame when host-to-GPU takes only 1.7ms?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 2}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand connection between the GPUs is saturated.", "The CPU is overloaded and cannot schedule the DMA transfer efficiently.", "The GPUs are connected over the PCIe bus, which has high overhead for peer-to-peer transfers.", "An inefficient CUDA memcpy call is causing software-level stalls."], "correct_index": 2}}, {"id": "edge-0442", "title": "The Vision Transformer Traffic Jam", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is a CNN a better fit than a ViT for the 33ms camera model on the edge GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has too many parameters, causing excessive swapping between DRAM and flash storage during inference.", "The model has not been properly quantized to INT8. The FP32 operations are too slow for the hardware.", "The ViT's self-attention mechanism has a low arithmetic intensity, making it memory-bandwidth bound on the architecture.", "The ViT's total FLOPs exceed the 275 TOPS rating of the device, making it compute-bound."], "correct_index": 2}}, {"id": "edge-0443", "title": "The Autonomous Vehicle Interconnect Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can two same-board GPUs still show multi-millisecond transfer stalls, and what interconnect should the platform use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network card connecting the two GPUs is faulty.", "The CPU is bottlenecked scheduling the DMA transfer; upgrade the CPU.", "The GPUs are communicating over the general-purpose PCIe bus, which adds significant protocol overhead. 
Use NVLink.", "Quantize the model from FP16 to INT8 to cut transfer size."], "correct_index": 2}}, {"id": "edge-0444", "title": "The Federated Fleet's Financial Failure", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most accurate evaluation of the true Total Cost of Ownership (TCO) of the FL driver-alert system, including failure-mode costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device training is consuming too much power, leading to high electricity costs and battery degradation concerns.", "The daily cellular data uploads of model weights are exorbitantly expensive.", "A single model failure event, caused by 'straggler bias' inherent to the FL approach, cost more than all other operational expenses.", "The hardware is inefficient; choosing a lower-power edge accelerator would have significantly reduced costs."], "correct_index": 2}}, {"id": "edge-0445", "title": "The Real-Time Vision Stall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given this profile, what is the most effective optimization to consistently meet the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Aggressively prune more filters from the convolutional layers to reduce the model's overall FLOPs.", "Increase the GPU's clock frequency to reduce the execution time of each individual kernel.", "Apply operator fusion using a graph compiler like TensorRT to combine sequential layers into single kernels.", "Increase the number of CPU worker threads in the data loading pipeline to feed the GPU faster."], "correct_index": 2}}, {"id": "edge-0446", "title": "The Federated Fleet Bottleneck", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What bottleneck is making FL training slow when 50M-parameter gradients are uploaded over 10 Mbps LTE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is compute-bound; the vehicles' accelerators are too slow to compute gradients on a 50M parameter model, causing the server to wait.", "The system is communication-bound; uploading the ~200MB FP32 gradient payload over the 10 Mbps link is the dominant time sink.", "The central server is the bottleneck; it cannot aggregate gradients from 1,000 vehicles simultaneously and requires a more scalable architecture.", "The system is memory-bound; the 50M parameter model is too large for the vehicles' local RAM, causing excessive page swapping to disk."], "correct_index": 1}}, {"id": "edge-0447", "title": "The Self-Driving Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the roofline model, is the 24.4ms Orin perception model compute-bound or memory-bound, and how should you optimize it?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. To improve performance, we must prune the model to reduce its 4 Trillion Ops.", "The Jetson platform is inadequate. We must switch to a platform with higher memory bandwidth to meet the 20ms deadline.", "The model is memory-bound. We must use operator fusion or more aggressive quantization to improve its arithmetic intensity.", "The bottleneck is power; the Orin is likely being thermally throttled. We should improve the cooling solution."], "correct_index": 2}}, {"id": "edge-0448", "title": "The Driver Monitoring Latency Budget", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Under a 15W budget and 8ms fixed overhead, which model meets the 33ms deadline: Efficient-CNN or Small-ViT?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0450", "title": "The Federated Fleet ROI Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which strategy is cheaper on first-year TCO, and which strategy is safer if raw-data privacy is a hard requirement for the 1M-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Training, because its annual TCO of ~$194,000 is significantly lower than the Federated Learning strategy.", "Federated Learning, because its annual data transfer cost of $39,600 is far lower than the Centralized approach.", "Centralized is cheaper on Year 1 TCO (~$194,000 vs ~$357,600), but Federated Learning is safer when raw-data privacy is a hard requirement because it avoids transmitting sensitive sensor data.", "Centralized Training, as the 400TB of raw data can be easily anonymized before upload, which removes the privacy concern and makes the cheaper option viable."], "correct_index": 2}}, {"id": "edge-0453", "title": "The Real-Time Driver Monitoring Dilemma", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you solve this problem and ship both features while staying within a hard 33ms real-time budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Replace the CNN with a more expressive Vision Transformer (ViT) architecture to learn both tasks simultaneously.", "Run Neural Architecture Search (NAS) to automatically find a single, more efficient architecture for both tasks.", "Implement a Mixture-of-Experts (MoE) architecture with a gating network that routes inputs to a specialized 'drowsiness' or 'distraction' expert.", "Apply 8-bit quantization. 
Since the model is compute-bound, reducing the data size will not help latency."], "correct_index": 2}}, {"id": "edge-0454", "title": "The Autonomous Vehicle Scaling Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 256 MB GPU-to-GPU transfer over 100 Gbps InfiniBand taking 30ms, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand link is saturated. We must upgrade to a 400 Gbps NDR link to increase bandwidth.", "The bottleneck is software overhead in the RDMA drivers. We need to write a custom kernel module to reduce latency.", "The use of an inter-node fabric (InfiniBand) for this tightly-coupled task is the bottleneck. The GPUs must be co-located in a single node with a supported high-bandwidth intra-node interconnect such as NVLink or NVSwitch.", "The PCIe bus on the host machine is saturated by other peripherals. We need to rearrange the PCIe cards."], "correct_index": 2}}, {"id": "edge-0455", "title": "The Over-Budget Fleet Update", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the proposed federated learning schedule fit the $30,000/month cellular budget, and what is its monthly data cost?", "chain_ids": ["edge-chain-auto-017-03"], "chain_positions": {"edge-chain-auto-017-03": 2}, "chain_tiers": {"edge-chain-auto-017-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$8,000. The plan only requires 1,000 GB of upload data, which is well within budget.", "$1,200. The data transfer per communication round is only 150 GB.", "$48,000. The plan requires 6,000 GB of data transfer per month at $8/GB, exceeding the $30,000 budget.", "$9,600. 
The plan requires 1,200 GB of data transfer per month."], "correct_index": 2}}, {"id": "edge-0456", "title": "The Autonomous Perception Latency Puzzle", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which parallelism strategy should you use across the two accelerators to get under 33ms, tensor or pipeline parallelism?", "chain_ids": ["edge-chain-auto-secondary-016-09"], "chain_positions": {"edge-chain-auto-secondary-016-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Pipeline parallelism is better, with a latency of 20.5ms.", "Tensor parallelism is better, with an effective latency of 24ms.", "Pipeline parallelism is the only option, but it fails with a latency of 40.5ms.", "Neither strategy works; tensor parallelism is too slow at 34ms because of communication overhead."], "correct_index": 1}}, {"id": "edge-0457", "title": "The ADAS Pipeline Frame Drop", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the 22ms custom LiDAR layer compute-bound or memory-bound on the 15W Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; 1.5 G-Ops is too much work for the 15W power profile. Increasing the Orin's power budget to 60W is the only solution.", "The layer's performance is limited by PCIe bus contention with other peripherals in the ADAS system.", "The layer is memory-bound. Its arithmetic intensity of 1.875 Ops/Byte is far below the Orin's roofline, and the 22ms latency is driven by poor memory access patterns given the theoretical minimum time of ~7.8ms to read 800 MB over the 102.4 GB/s bus.", "The layer is compute-bound. The INT8 operations are likely not mapping efficiently to the Orin's Tensor Cores, leading to low TOPS utilization. A custom CUDA kernel is needed."], "correct_index": 2}}, {"id": "edge-0458", "title": "The Drone's Dilemma: Compute vs. 
Bandwidth", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the INT8 EdgeViT miss the 33ms deadline despite its G-Op count seeming feasible on the Orin?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The EdgeViT's G-Op count is misleading; its self-attention operations are inherently more complex and take more clock cycles per OP on the Jetson's architecture.", "The EdgeViT is likely causing the Jetson to exceed its thermal design power (TDP), forcing the chip to throttle its clock speed and increase latency.", "The EdgeViT has a lower Arithmetic Intensity (AI) due to its memory access patterns, making it memory-bandwidth bound on the Jetson's 205 GB/s DRAM interface.", "The CUDA kernels for Transformer operations are not as mature as CNN kernels, leading to significant software overhead from things like kernel launch latency."], "correct_index": 2}}, {"id": "edge-0459", "title": "The Autonomous Vehicle Interconnect Dilemma", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Should you transfer the 128 MB feature map between the two SoCs over PCIe Gen4 x16 or 100 Gbps Ethernet with RDMA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose 100 Gbps Ethernet because RDMA provides a zero-copy transfer that is more efficient than the standard PCIe driver stack.", "Choose 100 Gbps Ethernet because it is the standard for low-latency GPU communication in high-performance clusters.", "Choose PCIe. Its raw bandwidth is higher and its architecture as a direct memory bus results in lower CPU overhead and less latency jitter.", "Neither is sufficient. The true bottleneck is the MIPI camera interface, which cannot supply data fast enough."], "correct_index": 2}}, {"id": "edge-0460", "title": "The Fleet vs. The Cloud: Drowsiness Detection TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which plan is more cost-effective for updating the drowsiness model across 500,000 vehicles, centralized telemetry or federated learning?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Plan A is more economical. The 10 MB weekly updates in Plan B are significantly larger than the video clips, making it more expensive in the long run.", "Plan B is more economical, with an annual cost of ~$24K compared to Plan A's ~$360K, but the difference is modest enough to prefer Plan A for simplicity.", "Plan B is overwhelmingly more economical. At $0.10/GB, it saves ~$336K/year; at enterprise cellular rates ($10/GB), the gap widens to ~$30M/year, making centralized collection financially infeasible.", "Both plans cost roughly the same (~$300K/year). 
The choice depends on model performance needs, not cost."], "correct_index": 2}}, {"id": "edge-0461", "title": "The ADAS Pipeline Stall", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the Hailo inference and CPU fusion stages run serially or as a pipeline to meet the 33ms AEB frame deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0462", "title": "The Autonomous Driving Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the root cause of the depthwise convolution bottleneck on the Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; the Jetson's 275 TOPS is insufficient for the 115.6M operations required.", "The device is power-throttling due to the high operation count, which is reducing the effective TOPS.", "The layer is memory-bound; its arithmetic intensity is far below the hardware's ridge point, so performance is limited by memory bandwidth.", "The bottleneck is the MIPI CSI-2 camera interface, which cannot supply data to the model fast enough."], "correct_index": 2}}, {"id": "edge-0463", "title": "The Edge Roofline Dilemma: CNN vs. ViT", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you decide whether the ViT can replace the optimized CNN while still meeting the 33ms deadline on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is compute-bound. Its latency will be ~27ms (7.5 T-Ops / 275 TOPS), so it is a feasible replacement.", "The ViT has ~1.8x more Ops, so it will be ~1.8x slower. Its latency will be 24ms * 1.8 = ~43ms, missing the deadline.", "The ViT is memory-bound. Its latency will be ~73ms, violating the 33ms deadline.", "Both models are compute-bound, but the ViT has more Ops. 
Its latency will be ~27ms which is too close to the deadline, so it is not feasible."], "correct_index": 2}}, {"id": "edge-0464", "title": "The Autonomous Data Jam", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the 25ms CPU-to-GPU cudaMemcpy slowdown despite fast NVLink GPU-to-GPU transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The host CPU is too slow to stage the 10 GB/s data stream, causing a data preparation bottleneck before the transfer begins.", "The NVLink bridge is misconfigured, and the CPU-to-GPU communication is secretly falling back to a slower interconnect path.", "The InfiniBand card used for data logging is saturating the PCIe bus, leaving insufficient bandwidth for the CPU-to-GPU data transfer.", "The CUDA driver has high overhead, adding fixed latency to every transfer operation regardless of PCIe bandwidth."], "correct_index": 2}}, {"id": "edge-0465", "title": "The Fleet-Learning Cost Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which proposal is cheaper for improving rare near-miss recall across 100,000 vehicles, centralized clip collection or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because the cloud ingress cost of $0.02/GB guarantees a cost fraction 500x cheaper than the cellular reimbursement.", "Federated Learning, because its $76,000 annual update cost is far below the centralized upload-plus-labeling cost of about $3,003,600.", "Centralized, because avoiding the $50,000 annual baseline OPEX for FL servers ensures centralized maintains higher margins.", "Federated Learning, but the fiscal benefits remain trivial due to the offset between labeling costs and the steep $260k data transfer tax."], "correct_index": 1}}, {"id": "edge-0466", "title": "The Sensor Fusion Bottleneck: 3D Parallelism", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you parallelize the independent camera branches to get the multi-camera model below the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use data parallelism: process frame N on stream 1 and frame N+1 on stream 2 to increase throughput.", "Use tensor parallelism: split the layers of the fusion head to parallelize its computation.", "Use model parallelism: run the front camera backbone and the side camera backbone simultaneously on different streams, then fuse.", "Use pipeline parallelism: assign the front camera backbone to early layers, and the side camera backbone to later layers."], "correct_index": 2}}, {"id": "edge-0467", "title": "The Autonomous Driving Roofline Puzzle", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 
the 40W power limit, is the LiDAR segmentation model compute-bound or memory-bandwidth-bound on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. Its arithmetic intensity of 898 Ops/Byte is very high, meaning the 184 TOPS of work is too much.", "The model is primarily limited by thermal throttling. The 40W power budget only allows for ~184 TOPS, completely restricting execution.", "The model is memory-bound. Its arithmetic intensity of 600 Ops/Byte is below the Jetson's power-constrained ridge point of ~898 Ops/Byte, so performance is dictated by memory bandwidth.", "The bottleneck is inefficient kernel execution. The model AI of 600 Ops/Byte implies 100% compute utilization."], "correct_index": 2}}, {"id": "edge-0468", "title": "The Transformer's Traffic Jam", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is MicroViT slower than the CNN on Jetson Orin despite having similar computations?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0469", "title": "The Sensor Fusion Stutter", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on a napkin math calculation, what is the most likely cause of the ~33ms 'dark time' latency overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The OS scheduler overhead for managing two GPUs is causing context switching delays.", "The PCIe bus is saturated; transferring 2 GB at the given 60 GB/s bandwidth takes ~33ms.", "The sensor ingress pipeline is using slow Ethernet, creating a data input bottleneck before the GPUs are even used.", "The model is memory-bound, and the 33ms is due to slow HBM access on the GPUs."], "correct_index": 1}}, {"id": "edge-0470", "title": "The Driver Monitoring TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which update strategy has the lower 2-year TCO for quarterly updates across 100,000 vehicles, centralized or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because the $10M one-time hardware cost for the fleet is prohibitively expensive.", "Centralized, because the $400,000 cloud training cost is much cheaper than the $100 per-unit BOM increase.", "Federated, because the $10M one-time hardware cost is significantly less than the $16.4M total cost of data uploads and training over 2 years.", "Federated, because performing on-device training consumes less total energy than cloud training."], "correct_index": 2}}, {"id": "edge-0471", "title": "The Autonomous Driving Perception Stall: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the 5 
trillion-op, 50 GB LiDAR model on Orin compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound; its 5 Trillion operations are too much for the AGX Orin's GPU.", "The device is power-throttling, as the AGX Orin cannot sustain the required TOPS/W in its 15W power mode.", "The model is memory-bound; its low arithmetic intensity means performance is limited by memory bandwidth, not compute.", "The MIPI CSI-2 camera interface is the bottleneck, as it cannot feed frames to the system fast enough."], "correct_index": 2}}, {"id": "edge-0472", "title": "The Autonomous Driving Frame Drop", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you mathematically evaluate this ViT proposal for edge deployment, and what is a superior architectural optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Replace the ResNet-50 backbone with a small Vision Transformer (ViT) of similar GFLOPs.", "Launch a Neural Architecture Search (NAS) to discover an optimal architecture for the edge hardware.", "Replace the standard 3x3 convolutions with depthwise separable convolutions.", "Keep the architecture but apply aggressive INT4 quantization to the convolutional layers."], "correct_index": 2}}, {"id": "edge-0473", "title": "The Autonomous Truck's Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which interconnect is feasible for a 512 MB in-chassis SoC-to-SoC transfer within the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use 10 Gbps Automotive Ethernet. It is an industry standard and the simplest to implement.", "Use PCIe Gen4 x8. The transfer time of ~32.5ms fits under 33ms for the copy alone.", "Redesign around a board-level PCIe Gen5 x16 DMA link. The ~8.1ms transfer time leaves real-time headroom.", "Use 25Gbps Automotive Ethernet. 
It is more realistic than datacenter fabric but still too slow for this payload."], "correct_index": 2}}, {"id": "edge-0474", "title": "The TCO of Privacy on Wheels", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which privacy-preserving strategy should you choose for 500,000 vehicles after comparing 3-year TCO and breach risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0475", "title": "The Autonomous Driving FPS Mystery", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the roofline model, what bottleneck explains 12 FPS despite 95% GPU utilization for the 40 GFLOP, 16 GB LiDAR model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. 95% GPU utilization indicates the compute units are saturated and we need a more powerful GPU to reach 20 FPS.", "The device is thermal throttling, lowering its clock frequency and missing the FPS target.", "The model is memory-bandwidth bound. Its arithmetic intensity is too low for the architecture, causing the GPU to stall while waiting for data.", "The model is PCIe-bandwidth bound. The 16 GB of data cannot be transferred over the bus fast enough to support 20 FPS."], "correct_index": 2}}, {"id": "edge-0476", "title": "The Perception Deadline Miss", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 5M-parameter ViT take 100ms when the 5M-parameter MobileNetV2 detector takes 25ms, despite nearly identical parameter counts?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 2}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has poor cache locality and is overwhelming the memory bandwidth. We should reduce the model's embedding dimension.", "The 5M parameters in the ViT are stored in FP32, while the CNN's were INT8, leading to a 4x larger memory footprint that is causing swapping to flash storage.", "The ViT's $O(N^2)$ compute scaling in its attention layers creates a massive FLOPs burden compared to the CNN's linear scaling, making it compute-bound. We should use NAS to find an efficient architecture.", "This is a training-serving skew issue. 
According to Chinchilla scaling laws, the ViT was undertrained, and we need to increase the training dataset size by 20x."], "correct_index": 2}}, {"id": "edge-0477", "title": "The Autonomous Vehicle Perception Stall", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What bottleneck is causing the missing ~20ms and 33% GPU utilization when copying eight 16MB camera streams to the Orin GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU LPDDR5 memory bandwidth is saturated by the vision transformer's attention heads.", "The PCIe bus host-bounce latency (16.2ms) is starving the compute cores.", "The model's 10ms compute time is causing thermal throttling, downclocking the PCIe bus.", "The 128MB transfer directly saturates the InfiniBand networking stack."], "correct_index": 1}}, {"id": "edge-0478", "title": "The Secure Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is adding cryptographic hash commitments to FL updates economically justified for the 10,000-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the annual operational cost is over $1,369,000, which far exceeds the risk-adjusted loss.", "No, the potential $1,000,000 loss is a 'black swan' event; the certain $1,095 annual energy cost should be avoided.", "Yes, the ~$1,100 annual energy cost is significantly lower than the expected loss from a potential model poisoning attack.", "Yes, but only because the $1,095 cost is less than the $7,300 hardware replacement cost."], "correct_index": 2}}, {"id": "edge-0479", "title": "The Autonomous Driving Perception Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, is the depthwise separable convolution layer on Orin compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound because 500 Million OPs is too much computation for a single layer on an edge device.", "The layer is memory-bound because its arithmetic intensity (10 Ops/Byte) is far below the hardware's ridge point (~1,342 Ops/Byte).", "The layer is thermally limited; the AGX Orin is likely throttling and unable to deliver its peak 275 TOPS.", "The layer is compute-bound because its low arithmetic intensity means it isn't doing enough work per byte to be efficient."], "correct_index": 1}}, {"id": "edge-0480", "title": "The Vision Transformer Deadline Miss", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 640x640 INT8 ViT on Orin, are the self-attention layers compute-bound or memory-bandwidth-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": "", "options": ["The ViT's 16 GOPS are too high for the hardware, making it compute-bound. The MobileNet's 0.5 GOPS are a better fit for the available compute.", "The low GPU utilization indicates a CPU bottleneck in the data preprocessing pipeline; the model choice is irrelevant until the data loading is fixed.", "The ViT is memory-bound, confirmed by its low Arithmetic Intensity (100 Ops/Byte). Switching to MobileNet, which moves 10x less data, should reduce latency by ~10x to ~4.5 ms.", "Both models are memory-bound, but the MobileNet has a lower Arithmetic Intensity (~17 Ops/Byte), which means it will be even more memory-bound and thus slower than the ViT."], "correct_index": 1}}, {"id": "edge-0482", "title": "The Federated Fleet's Budget Blowout", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the primary driver of the 5x daily cellular data cost increase in the 10,000-vehicle FL A/B test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 25% increase in model size is the primary cause, as larger binaries always have proportionally higher transfer costs.", "The new model is uploading raw sensor data logs for debugging, which is a standard practice in A/B tests for safety-critical systems.", "The 4x increase in communication frequency is the dominant cost driver, creating a 5x multiplicative effect when combined with the larger model size.", "On-device training for the new model is more compute-intensive, causing the cellular modem to draw more power and transmit more telemetry, leading to higher data charges."], "correct_index": 2}}, {"id": "edge-0484", "title": "The Drone's Dilemma: Transformer vs. CNN Latency", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Is the manager correct to suggest halving the patch size to 8x8 to speed up the ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The manager is wrong. The quadratic scaling of attention with the increased sequence length will cause a catastrophic increase in latency. The team should use the MobileNet-L.", "The manager is right. Smaller patches can be processed in parallel, and the Jetson Orin's high TOPS count can handle the increased workload within the deadline.", "The ViT has too many parameters. The latency issue can be solved by pruning 50% of the weights, which will cut the 50ms latency in half to 25ms.", "The issue is memory bandwidth, not compute. 
Using smaller patches will increase cache hits and could potentially reduce latency."], "correct_index": 0}}, {"id": "edge-0485", "title": "The Autonomous Vehicle Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 512 MB GPU-to-GPU feature-map transfer over 32 GB/s PCIe Gen5 the bottleneck, and would NVLink fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is bottlenecked; it doesn't have enough PCIe lanes to feed both GPUs, causing the stall.", "The problem is the network; InfiniBand is needed to get sensor data to the compute unit faster.", "The 16ms PCIe transfer time consumes nearly 50% of the 33ms real-time budget, making it the bottleneck. A faster interconnect is required.", "The issue is software; the data transfer should be optimized by using RDMA to bypass the CPU."], "correct_index": 2}}, {"id": "edge-0486", "title": "The Federated Fleet Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the projected annual data transmission cost for 1 million vehicles, and is the full rollout economically feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$891k per year. The cost for the 50,000-vehicle pilot is well under budget, so the rollout is approved.", "The primary cost is the on-device power consumption from daily training, not data transfer. The feature is feasible if battery drain is acceptable.", "~$17.8M per year. The feature is not economically viable without significant cost optimization.", "~$48,828 per year. The feature is extremely cheap because federated learning minimizes data transfer compared to sending raw sensor logs."], "correct_index": 2}}, {"id": "edge-0487", "title": "Jetson Orin Roofline Comparison for 3D Detection", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you compare the two 3D detection models for performance per watt on the 15W Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0488", "title": "The Utilization Paradox", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which candidate model meets the 13.3ms latency budget once Orin utilization is accounted for, the CNN or the ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B (ViT) is the only choice. Its higher hardware utilization results in a latency of ~11ms, which is within the 13.3ms budget, while the CNN is too slow at ~14.5ms.", "Model A (CNN) is the clear choice. With only 300 G-MACs, it is 2.5x smaller and takes ~2.2ms, easily meeting the deadline.", "Neither model is viable. 
The CNN will take ~14.5ms and the ViT will take over 30ms. Both fail the 13.3ms budget.", "Both models are viable. The CNN takes ~2.2ms and the ViT takes ~5.4ms, both well within the total 33.3ms budget."], "correct_index": 0}}, {"id": "edge-0490", "title": "The Fleet Update Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which update strategy has a lower annual TCO for fixing e-scooter detection across 100,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Strategy A, because the $5M platform fee for Strategy B is far greater than the $800k annual training cost for Strategy A.", "Strategy A, because uploading raw data allows for more powerful, unbiased centralized models, which justifies the higher but manageable data cost.", "Strategy B, because the annual data transfer cost is dramatically lower, making the total cost (~$5.7M) significantly less than Strategy A (~$73.8M).", "The costs are roughly equivalent; the high license fee of Strategy B cancels out the data savings, so the decision should be based on privacy alone."], "correct_index": 2}}, {"id": "edge-0492", "title": "The Edge Transformer Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Should the ResNet-34 replacement be a Vision Transformer or an efficient CNN to maintain 30 FPS on Jetson Orin?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0495", "title": "The Vision Transformer Performance Cliff", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the ViT underperform the MobileNet-style CNN on Orin despite having fewer parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT model has a bug causing excessive memory leaks.", "The ViT's total FLOP count (8 GFLOPs) is simply too high for the Jetson AGX Orin's compute capacity.", "The ViT has a much lower Arithmetic Intensity, making it severely memory-bandwidth bound on the Jetson hardware.", "The ViT has fewer parameters, which means it cannot take full advantage of the GPU's parallelism."], "correct_index": 2}}, {"id": "edge-0496", "title": "The Autonomous Vehicle PCIe Stall", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the 42ms camera-to-GPU latency regression, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated. 
Moving 200MB is too slow and the system requires an upgrade to a wider bus or compression.", "The GPU's HBM3 memory controller is overwhelmed, causing backpressure on the PCIe bus and slowing down the data ingress.", "The data is in pageable host memory, forcing a hidden CPU-bound copy into a pinned staging buffer before the GPU DMA transfer can start.", "The system's NVLink switch is congested with model-parallel traffic, interfering with the PCIe controller and adding queueing delay."], "correct_index": 2}}, {"id": "edge-0497", "title": "The Delivery Robot's Perception Stall", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the LiDAR workload compute-bound or memory-bound, and what should you optimize to reach 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The GPU is at 100% utilization, which means the compute units are fully saturated and are the bottleneck.", "Compute-bound. The model requires 40 G-IOPs, which is a heavy computational load that is challenging for an edge device.", "Memory-bound. The workload's arithmetic intensity (~381 Ops/Byte) is significantly below the hardware ridge point (~1,342 Ops/Byte), so performance is limited by memory bandwidth.", "Memory-bound. The 105 MB parameter footprint exceeds the L2 cache, forcing all operations to spill to SSD storage."], "correct_index": 2}}, {"id": "edge-0498", "title": "The Drone's Dropped Frames", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using arithmetic intensity, why is the ViT bottlenecked on the Orin and why would MobileNetV3 likely be faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is memory-bandwidth-bound due to the low arithmetic intensity of attention; the CNN's higher arithmetic intensity makes it compute-bound and a better fit for the Orin's architecture.", "The GPU is at 100%, so the system is thermally throttling. A new model won't help until we add a bigger heatsink.", "The ViT model is too large and is causing slow data swapping from NVMe storage, which is the bottleneck.", "The model has too many layers. 
Reducing the depth of the ViT is the only way to meet the latency target, as model family doesn't matter."], "correct_index": 0}}, {"id": "edge-0499", "title": "The Autonomous Vehicle DMA Bounce", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 256 MB GPU-A to GPU-B transfer taking about 20ms instead of the expected 8ms, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen4 x16 bus (32 GB/s) lacks the necessary bandwidth for a 256 MB transfer, which requires an upgrade to PCIe Gen5.", "The transfer is taking 20ms because the data is being routed through the 1 Gigabit Ethernet network switch instead of the PCIe bus.", "The transfer is bouncing through CPU host memory because peer-to-peer DMA is not enabled, effectively doubling the bus traffic and overhead.", "The NVLink bridge connecting the GPUs is faulty, forcing a fallback to the much slower PCIe bus for the transfer."], "correct_index": 2}}, {"id": "edge-0500", "title": "The Fleet Learning TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the first-year TCO of the centralized A/B pipeline versus Federated Learning, and when does FL become cheaper?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The A/B Telemetry approach is more expensive because daily data uploads from 10,000 cars create a higher server load and data ingress cost than weekly uploads from the FL fleet.", "The Federated Learning approach is more expensive, primarily because the total data volume from sending 5MB model updates across 100,000 cars significantly exceeds the telemetry data volume.", "The Federated Learning approach has a much higher TCO in Year 1, driven predominantly by the large, one-time engineering investment required to build the secure aggregation infrastructure.", "The on-device compute for federated training on each car's ECU consumes significant power, making the fleet-wide energy cost the largest component of TCO for the FL approach."], "correct_index": 2}}, {"id": "edge-0502", "title": "The Robot's Perception Power Budget", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which model should you choose under the 15W, 30 FPS budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B is the better choice because its calculated latency (1.5 GB / 204.8 GB/s ≈ 7.3ms) is faster, providing a larger safety margin within the 33ms budget.", "Neither model is viable, as their compute requirements (500-800 GOPS) are too high for an edge device.", "Model A is the only choice because its compute-bound nature makes it far more power-efficient, whereas Model B's memory-bound nature would cause it to exceed the 15W power budget.", "Both models are viable, but Model A is preferable because compute-bound workloads are easier to 
optimize with techniques like kernel fusion."], "correct_index": 2}}, {"id": "edge-0503", "title": "The Autonomous Driving Latency Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the suggestion to use InfiniBand likely flawed, and what is the actual bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The suggestion is correct; InfiniBand's lower latency and higher bandwidth are necessary for real-time sensor streams.", "The board should be redesigned to use NVLink instead of PCIe to connect the sensor card, as NVLink is a faster GPU interconnect.", "The suggestion is wrong because InfiniBand is a datacenter interconnect; the true bottleneck is likely a 'double copy' issue, which should be solved with a zero-copy protocol like GPUDirect RDMA.", "The CPU is too slow to handle the DMA programming for PCIe. The system needs a CPU upgrade to reduce scheduling latency."], "correct_index": 2}}, {"id": "edge-0504", "title": "The Federated Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which pilot is cheaper on direct data cost, centralized user collection or Federated Learning, and how should you calculate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$25,000. The federated approach is significantly cheaper as the cost is dominated by a single model broadcast.", "$25,000. The federated approach is more expensive because transmitting raw voice data from 10,000 cars is costly.", "$125,000. 
The federated approach is cheaper than the $1,250,000 centralized raw-audio collection.", "The costs are comparable, so the choice depends on implementation complexity rather than TCO."], "correct_index": 2}}, {"id": "edge-0506", "title": "The Self-Attention Latency Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which listed strategy is the best latency fix while preserving enough obstacle-detection quality for deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive INT4 quantization to the existing ViT.", "Use Neural Architecture Search (NAS) to find a smaller ViT configuration.", "Replace the ViT with a CNN built using depthwise separable convolutions.", "Implement a Mixture-of-Experts (MoE) layer."], "correct_index": 2}}, {"id": "edge-0508", "title": "The Wake-Word ROI Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the 100,000-car wake-word fleet, with 10,000 cars participating in FL, what is the first-year TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FL data transfer alone would exceed $10 million annually.", "The FL first-year TCO is roughly $520 because only bandwidth should be counted.", "The FL plan has a first-year TCO of $50,520.", "The FL approach is the cheapest option as it only costs $520 for the entire year."], "correct_index": 2}}, {"id": "edge-0510", "title": "The Self-Attention Traffic Jam", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the most effective change to meet the deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply INT8 quantization to the Vision Transformer and keep the architecture.", "Increase the Jetson Orin's power mode from 15W to 60W to increase TOPS.", "Replace the ViT with a MobileNet-style CNN using depthwise separable convolutions.", "Increase the batch size to better saturate the GPU cores."], "correct_index": 2}}, {"id": "edge-0512", "title": "The Fleet vs. 
The Cloud: A TCO Showdown", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which daily update strategy has lower operational cost for the 100,000-vehicle fleet, centralized upload or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Strategy A is cheaper because the cloud compute cost for federated learning will grow unpredictably with more users.", "Strategy B is cheaper, but the main savings come from the lower cloud compute cost.", "Strategy B is cheaper primarily due to the ~50x reduction in cellular data transfer costs.", "The costs are roughly equivalent; the privacy benefits of Strategy B are the only real differentiator."], "correct_index": 2}}, {"id": "edge-0515", "title": "The PCIe Latency Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 64 MB PCIe transfer taking 28ms instead of 8ms, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe bus is sharing bandwidth with the GPU, which is also accessing system DRAM. This contention is saturating the bus.", "The MIPI CSI-2 camera interfaces are the real bottleneck and are failing to deliver data to system memory fast enough.", "The driver is using small, non-coalesced memory transfers, making the operation bound by PCIe's transaction latency rather than its bandwidth.", "The effective sustained bandwidth of the PCIe bus is much lower than its theoretical peak, likely around 2.3 GB/s due to hardware limitations."], "correct_index": 2}}, {"id": "edge-0517", "title": "The Vision Transformer Performance Regression", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the new ViT slower than the old CNN on Jetson Orin despite having 20% fewer G-MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The G-MAC count is misleading; Transformer MACs are less efficient and take longer to execute on the Orin's Tensor Cores.", "The ViT is memory-bandwidth bound; its low arithmetic intensity means the compute units are starved waiting for data from DRAM.", "The model has too many parameters, causing it to be capacity-bound by the Orin's 8-32GB of DRAM.", "The performance drop is due to insufficient quantization; the model needs to be converted to INT4 to increase throughput."], "correct_index": 1}}, {"id": "edge-0518", "title": "The Sensor Fusion Traffic Jam", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 1.5 GB feature-map transfer over PCIe Gen4 x16 fit within a 33ms fusion budget, and what is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is 
starving the GPU during sensor data ingestion.", "The PCIe bus is the bottleneck.", "The issue is high PCIe protocol overhead.", "NVLink is designed for training, not inference."], "correct_index": 1}}, {"id": "edge-0519", "title": "The Fleet vs. Federated TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do the annual data-transfer costs of centralized video upload and federated learning compare for a fleet of 10,000 vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Learning. The annual cost is only about $27,000, a reasonable price for the high-quality raw data needed for debugging.", "Centralized Learning. The on-device compute required for Federated Learning would add significant thermal load and power consumption, outweighing any data cost savings.", "Federated Learning. The annual data cost is ~$7,300 versus ~$274k for the centralized approach, making it ~40x cheaper while protecting user privacy.", "Federated Learning is not viable. Uploading a 1 MB gradient file over a cellular network will introduce too much latency and miss the vehicle's real-time processing deadlines."], "correct_index": 2}}, {"id": "edge-0520", "title": "The LiDAR Perception Bottleneck: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 1.2 TOps, 4 GB LiDAR model compute-bound or memory-bound on the 30W Orin, and what should you optimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. The 30W power cap enforces thermal throttling, blocking the peak 275 TOPS ceiling.", "The model is CPU-bound. The slow ARM fabric is incapable of staging the point clouds fast enough for the Ampere architecture.", "The model is memory-bound. Its arithmetic intensity of 300 Ops/Byte is below the Orin's ~732 Ops/Byte ridge point, so reducing DRAM traffic is the priority.", "The model is strictly compute-bound. 
Moving 1.2 Trillion analytical ops forces heavy queueing logic on edge tensor cores."], "correct_index": 2}}, {"id": "edge-0521", "title": "The Self-Driving Stall: Diagnosing a Memory-Bound Vision Transformer", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the ViT missing the 30ms deadline with 25% GPU utilization but near-100% memory-controller load on the edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has too many operations for the hardware, making it compute-bound; switching to a smaller CNN would solve the issue because peak TOPS is the limiting resource.", "The low GPU utilization indicates a CPU bottleneck in the data preprocessing pipeline; the model choice is irrelevant until the data loading is fixed.", "The ViT is memory-bound, confirmed by its low Arithmetic Intensity (100 Ops/Byte) versus the ~1342 Ops/Byte ridge point; a replacement model must reduce memory traffic or increase locality, not just reduce FLOPs.", "Both models are memory-bound, but the MobileNet has a lower Arithmetic Intensity (~17 Ops/Byte), which means it will be even more memory-bound and thus slower than the ViT."], "correct_index": 2}}, {"id": "edge-0523", "title": "The Automotive Privacy-TCO Tug-of-War", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which strategy is superior under raw-video rules: centralized upload with later anonymization or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0525", "title": "The Federated Fine-Tuning Payback Period", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is Federated Learning economically feasible for the 10,000-robot reflective-sign problem compared with cloud collection and manual labeling?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Manual labeling is cheaper. Running training on 1,000 devices for 3 months will incur massive energy and hardware degradation costs.", "Manual labeling is cheaper. A team of 5 annotators can label the required images in a few weeks.", "Federated Learning is drastically cheaper. The total energy cost is less than $100, whereas the manual labeling cost is $10,000 daily.", "Neither is viable. 
The security risk of a model poisoning attack during Federated Learning is too high for a safety-critical system."], "correct_index": 2}}, {"id": "edge-0527", "title": "The GPU Power Gating Latency", "topic": "safety-certification", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did GPU power gating increase the initial detection latency to 180ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0528", "title": "The Watchdog Timeout Freeze", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What OS-level event stalled your script for over 200ms?", "chain_ids": ["edge-chain-auto-secondary-001-06"], "chain_positions": {"edge-chain-auto-secondary-001-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0529", "title": "The UVC Camera MJPEG CPU Tax", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did saving USB bandwidth destroy your CPU performance and slow down inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0530", "title": "Real-Time Kernel Scheduling Isolation for ML Throughput", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the real-time kernel configuration cut ML throughput, and how do you preserve motor-control latency without starving perception?", "chain_ids": ["edge-chain-auto-001-09"], "chain_positions": {"edge-chain-auto-001-09": 1}, "chain_tiers": {"edge-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0532", "title": "The Tracker Addition Budget", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the estimate 20ms + 12ms = 32ms dangerously wrong for fitting within the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0534", "title": "The CAN Bus Bandwidth Crunch", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much effective bandwidth does CAN 2.0B provide, and what share is your 30 Hz ML telemetry consuming?", "chain_ids": ["edge-chain-auto-secondary-009-20"], "chain_positions": {"edge-chain-auto-secondary-009-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0535", "title": "The GPU Driver Crash Recovery", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you maintain perception during a GPU driver crash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "The compiler should always fuse all possible operations; selective fusion never outperforms maximum fusion.", "You need a perception fallback that doesn't depend on the GPU. The Orin has multiple independent compute engines: (1) DLA (Deep Learning Accelerator) — has its own driver stack, independent of the GPU driver. Pre-load a lightweight obstacle detection model (MobileNet-SSD, ~5 MB) on DLA1.", "Update the driver. or \"File a bug with NVIDIA.\" Both are correct long-term actions but don't solve the immediate safety problem for deployed units."], "correct_index": 2}}, {"id": "edge-0538", "title": "The Fleet Firmware Fragmentation", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you handle deploying a new model across this fragmented fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Update all devices to v3.1 first, then deploy the model. 
This ignores why the fragmentation exists.", "Automatic restart on failure is dangerous for safety-critical systems; manual intervention should always be required.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "The fragmentation exists because OTA updates fail silently in the field."], "correct_index": 3}}, {"id": "edge-0539", "title": "The Rain-Soaked Quantization Cliff", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does quantization amplify the weather-related accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0542", "title": "The Ambarella CV5 Encoding Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Since the model and encoder use separate hardware blocks, what causes the inference latency to spike when the encoder is active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0543", "title": "The Qualcomm RB5 Hexagon DSP", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can you do better to improve the drone battery life by relocating the ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0544", "title": "The Silent NPU Killer", "topic": "safety-certification", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the system to detect silent NPU failures, ensure functional safety, and implement graceful degradation without external intervention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0545", "title": "The Pruning Paradox on Edge AI Accelerators", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 80% unstructured pruning make the 3x3 convolution slower on Jetson AGX Orin and drop Tensor Core utilization to near zero?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0546", "title": "The Autonomous Drone's Latency Crisis", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should you use tensor or pipeline parallelism for the 400MB ViT across the two 256MB accelerators, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0548", 
"title": "The Autonomous Vehicle 'FSDP' Fallacy", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did FSDP/ZeRO-style sharding across two Orins over PCIe Gen4 x8 double latency, and what architecture should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0549", "title": "The Edge Training Scaling Collapse", "topic": "data-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does data-parallel fine-tuning of the 2B-parameter model fail to scale across four Orins connected by PCIe Gen4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0553", "title": "The Automotive Parallelism Dilemma", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should you use tensor or pipeline parallelism for the 20B ViT across two Orins over the 40Gbps link, and why?", "chain_ids": ["edge-chain-auto-secondary-016-09"], "chain_positions": {"edge-chain-auto-secondary-016-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-09": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0556", "title": "The Unaligned Memory DMA Fault", "topic": "extreme-quantization", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the exact same DMA code crash on the M0+?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0557", "title": "The Shared Bus Arbitration Lock", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the NPU frame rate drop to 15 FPS when Ethernet traffic spikes, despite theoretical bandwidth sufficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0558", "title": "The USB Power Suspension", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What OS power management feature is causing this massive cold-start penalty on the USB bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0559", "title": "The Dataflow vs GPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 4x TOPS advantage only yielding a 1.25x speedup?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Hailo-8 is a dataflow architecture — it maps the entire model graph onto a spatial pipeline of physical compute units.", "The Orin drivers aren't optimized yet.", "The system is healthy and no recovery action is needed.", "Operator fusion primarily reduces compute time."], "correct_index": 0}}, {"id": "edge-0560", "title": "The Resolution-Accuracy Pareto", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What critical information does resizing destroy, and how do you design a system that meets the deadline while preserving long-range detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Resizing 4K to 640x640 destroys small object features (e.g., a person becomes 3 pixels tall), making detection impossible. Use a multi-scale ROI tiling strategy to run detection on high-res crops.", "The accuracy degradation is caused by numerical instability in the framework's matrix multiplication kernels, not by the model or data.", "Resize to 640x640 and accept the accuracy loss. This treats resolution as a single knob when it's actually a spatial information budget.", "The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it."], "correct_index": 0}}, {"id": "edge-0561", "title": "The Stereo Depth vs Monocular Trade-off", "topic": "real-time-deadlines", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the bandwidth, compute, and accuracy trade-offs of replacing stereo with MiDaS-small monocular depth on the TDA4VM forklift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0562", "title": "The Real-Time Scheduling Priority", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why are you still missing deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0563", "title": "The Multi-Resolution Input Strategy", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is switching to 320x320 under thermal throttling the right fix, and how would you design adaptive resolution to preserve safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0564", "title": "The Factory Floor EMI Ghost: Fault Tolerance & Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How is EMI corrupting inference, and where in the data path is it entering?", "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0566", "title": "The EMC Compliance Nightmare", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the neural network doing that generates RF emissions, and how do you fix it without changing the model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0568", "title": "The Physical Intruder", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What hardware and software mechanisms should be in place to detect and mitigate such an attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0569", "title": "The Early vs Late Fusion Trade-off", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the system-level trade-offs between early and late fusion, and which do you recommend for a Jetson Orin deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0570", "title": "The Edge-Cloud Split Inference", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might splitting the model between edge and cloud be better than either extreme?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0571", "title": "The Dark Silicon Enigma", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What SoC power and thermal envelope limit did the all-engines-at-peak pipeline violate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0572", "title": "The TDA4VM Vision Pipeline", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you partition across the heterogeneous cores to guarantee the 33ms deadline?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 4}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all three models on the MMA accelerator sequentially. 
Even if the MMA theoretically supports it, moving models that are better suited for DSP/CPU onto the MMA results in poor utilization.", "Software-only health checks are sufficient; a hardware watchdog timer adds unnecessary complexity to the system design.", "Partition across the heterogeneous SoC by workload characteristics, running YOLOv5s on the MMA, Lane detection on the DSP, and Driver monitoring on the CPU.", "Streaming all raw metrics to the cloud in real-time is the most reliable approach because edge-side aggregation risks losing anomalies."], "correct_index": 2}}, {"id": "edge-0574", "title": "The Futile Pruning", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did the massive reduction in FLOPs yield such a disappointing latency improvement, and why is the proposal to 'just prune more' likely to fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0575", "title": "The EfficientNet Power Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is a model with 3x fewer operations running slower and consuming the same amount of power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0576", "title": "The Headlight Saturation Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did this performance optimization lead to a catastrophic, non-linear failure instead of a graceful degradation in accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0577", "title": "The Perception Pipeline Performance Cliff", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did moving the identical model from 720p to 1080p cause latency to non-linearly explode to 200ms instead of the predicted ~75ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0578", "title": "The Night-Vision Quantization Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is this daytime-only PTQ calibration and validation insufficient, and what catastrophic failure mode should you expect under night-time glare?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0579", "title": "The Night-Vision Quantization Collapse", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What numerical INT8 failure is occurring on the 
Orin, and why does it disproportionately affect high-contrast night scenes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0580", "title": "The Night-Blind Perception Model", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What failure mode can daytime-only INT8 calibration introduce, despite 25ms latency and 99% daytime accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0582", "title": "The Efficient Transformer Paradox", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can a 500-GOp / 200MB model beat a 100-GOp / 500MB model on a 30W edge SoC despite doing 5× more operations?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 4}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0583", "title": "The Perception Pipeline Fusion Fallacy", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under what conditions could this optimization paradoxically increase end-to-end latency and cause you to miss the deadline even more severely?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0584", "title": "The Mixed-Precision Backfire", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did INT8 PTQ make the LiDAR detector miss distant objects, and what activation failure occurs at the first INT8 layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0586", "title": "The Distillation Performance Trap", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why could a lower-FLOP custom replacement for ResNet's Conv-BN-ReLU blocks run slower than the 22ms TensorRT baseline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0587", "title": "The Sensor Fusion PCIe Trap", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What happens if the camera feature map grows from 128MB to 256MB over PCIe Gen4 x8, and why isn't an external InfiniBand link the right fix?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 4}, "chain_tiers": 
{"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0588", "title": "The Headlight Glare Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did the 'successful' quantization lead to a catastrophic failure in production, and what is your precise, justified plan to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0589", "title": "The Edge Efficiency Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which candidate should you choose under the 15W Orin budget, and why can the lower-MAC Sparseformer be slower and use more power than the dense CNN?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 4}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0590", "title": "The Catastrophic Night-Drive Quantization Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would an optimization designed to make the model faster cause it to become blind to the most salient objects in a scene, and only under specific conditions?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 2}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0591", "title": "The Self-Defeating Optimization Cascade", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did 50% unstructured pruning and FlashAttention make the 1B VLM slower on Jetson AGX Orin instead of meeting the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0592", "title": "The Headlight Blind Spot", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What calibration mistake can make the INT8 pedestrian detector fail on night-time headlights despite fitting the 15W Orin budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0593", "title": "The Speculative Verifier Cold-Start Stall", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a drafter-verifier 
design that invokes the ViT only occasionally blow the 33ms deadline with 100-200ms latency spikes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0594", "title": "The Thermal Throttling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does Model A's performance collapse under thermal throttling while Model B's only degrades slightly, and which model is better for production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0595", "title": "The Fused Perception Model Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did fusing the dense and sparse models backfire, leading to a slower and less efficient system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0596", "title": "The Headlight Blindness Quantization Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did PTQ pass daytime validation but fail under headlight glare, and what immediate fix beats simply adding night calibration data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0597", "title": "The Night-Blind Edge Model", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is adding headlight-illuminated images to calibration enough, and what non-linear INT8 failure makes the detector night-blind?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0598", "title": "The Fusion Priority Inversion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would operator fusion that improved average performance cause a catastrophic failure in the system's P99 latency?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 4}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0599", "title": "The Sparse Model Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why might the team lead's prediction be catastrophically wrong, causing Model B to have higher latency and worse power efficiency in production?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0600", "title": "The Headlight Overflow 
Failure", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the system failing catastrophically instead of showing a graceful mAP drop, and what specific steps would you take to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0601", "title": "The Integer Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you construct the integer roofline for YOLOv8 on the Orin DLA, and why might 18 TOPS not mean compute underutilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0602", "title": "The Watchdog Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How did a low-priority background ML task kill the real-time thread?", "chain_ids": ["edge-chain-auto-secondary-001-06"], "chain_positions": {"edge-chain-auto-secondary-001-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0605", "title": "Power-Adaptive Inference System", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a power-adaptive inference system that maximizes overall detection accuracy under these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0606", "title": "The Drone Fleet Vision Upgrade", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three design decisions, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0607", "title": "The Hard Real-Time Factory Defect Detector", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect the 4K crack detector to use the ViT within 10ms, and why is a naive ViT impossible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0608", "title": "The OTA Thermal Brick", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why could the 2:4-pruned TensorRT model's 30% lab latency win disappear on passively cooled Orins in the field?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0609", "title": "The Multi-Sensor Contention Collapse", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most likely physical bottleneck causing this counter-intuitive result, and how would you redesign the system software architecture to resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0610", "title": "The Headlight Blindness Problem", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What quantization architecture would meet the 33ms budget while avoiding INT8 activation overflow across extreme lighting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0611", "title": "The Autonomous Drone Reflex Gap", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you run the VLM and locust detector within 30W, double the VLM effective rate to 30 FPS, and alert within 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0612", "title": "The Jet-Lagged Copilot LLM", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you redesign the 7B offline copilot to fit 32GB, keep TTFT under 500ms, and exceed 30 tokens/s on edge hardware?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using a larger model with more parameters will improve both accuracy and latency.", "Post-training quantization always produces equivalent results to QAT.", "FlashAttention-2 solves the prefill capacity limit, and INT4 with Speculative Decoding overcomes the memory bandwidth generation limit.", "Engineers unfamiliar with LLM internals focus on the wrong bottleneck."], "correct_index": 2}}, {"id": "edge-0613", "title": "4MP Camera Upgrade Breaks 33ms on Memory Bandwidth", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does upgrading from 1.2MP to 4MP cameras break the 33ms deadline, and how would you redesign the model for the 45W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0614", "title": "The Autonomous Vision System Fork: ViT vs. 
Specialized CNN", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which DMS model should you choose for the constrained SoC, ViT or MobileNet-style CNN, and how would you prove it meets the 33ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0616", "title": "The In-Car Assistant Latency Crisis", "topic": "speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions to meet the latency budget, and how do they combine to solve the problem?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0617", "title": "The Autonomous Perception Deadline Miss", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What multi-stage optimization strategy would bring the 20 TFLOP model below 33ms on Orin, and why in that order?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0618", "title": "The 4D Radar Fusion Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What heterogeneous compute architecture would meet the 33ms/60W constraints, and which workloads would you place on each unit?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 5}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0619", "title": "The Autonomous Perception Deadlock", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you choose a perception model for the 30W passively cooled module using effective TOPS/W rather than FLOPs alone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0621", "title": "Ordering Optimizations for a Transformer on a 15W ECU", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What ordered optimization plan would make the 5x-too-slow valet Transformer meet 33ms on a 15W ECU, and why in that order?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 5}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0622", "title": "The Predictive Overtake Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which proposal is more appropriate for meeting the 50ms latency budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming that because FlashAttention is effective for training in the cloud, it's automatically the best choice for edge inference. At short sequence lengths typical of real-time edge tasks, the N^2 attention matrix may actually fit in on-chip SRAM, diminishing FlashAttention's advantage.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "Proposal A is more appropriate: use speculative decoding with the 50M draft model, then verify accepted token blocks with the 2B model; FlashAttention is only a secondary context-pass optimization and cannot close a 10x latency gap by itself."], "correct_index": 3}}, {"id": "edge-0623", "title": "The 8K Sensor Upgrade Dilemma", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For 8K/30FPS on the edge SoC, should you launch DenseViT or FastFusion-CNN, and what roofline bottleneck drives the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0624", "title": "The Unified Automotive Perception Stack Design", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architecture would you choose for the unified 8-camera perception stack under real-time SoC constraints: CNN, ViT, or a hybrid?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0625", "title": "The 360° Vision System Latency Collapse", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What new architecture would meet the 33ms and 60W budgets for eight cameras while leaving headroom for future LiDAR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to move to a single, unified architecture that performs early fusion and is deployed with hardware-aware compilation.", "The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck.", "Using a larger model with more parameters will improve both accuracy and latency, because larger models have better computational efficiency.", "Prune the existing 8 models more aggressively to eliminate parameter overhead."], "correct_index": 0}}, {"id": "edge-0626", "title": "The Speculative Braking Rationale Failure", "topic": 
"speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What three-point plan gets the 1B-parameter rationale generator under 15ms despite autoregressive decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The inference framework is adding unnecessary overhead; switching to a different runtime (e.g., from TensorRT to ONNX Runtime) will fix the issue.", "The most common L5 answer is 'use a much smaller model.' This ignores the product requirement for a certain level of reasoning quality that the 1B model provides. Another incorrect path is 'batch the sensor inputs,' which makes no sense for a single, real-time braking event.", "The core problem is the sequential target-model pass in auto-regressive decoding. Keep the model resident, use a small draft model, and verify several proposed tokens in one bounded target-model pass.", "The model weights are being duplicated in memory during inference; using model sharding across CPU and GPU will halve the footprint."], "correct_index": 2}}, {"id": "edge-0627", "title": "The Automotive Roofline Dilemma", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which compute architecture would you choose to meet 45W and 30 FPS for the memory-intensive vision model, and why isn't peak TOPS enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0628", "title": "The ADAS Look-Ahead Dilemma", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a full-stack system to make a 3B VLM generate 5 tokens within a 33ms perception loop on an Orin already 70% utilized?", "chain_ids": ["edge-chain-auto-001-07"], "chain_positions": {"edge-chain-auto-001-07": 5}, "chain_tiers": {"edge-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0629", "title": "The Autonomous Perception Stack Fork", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should you choose between the two 30W perception designs, and why is achieved TOPS-per-watt more useful than FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0630", "title": "The Night-Drive Quantization Collapse", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did the INT8 model fail under headlight glare, and how would mixed precision fix it within the 33ms Orin budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0631", 
"title": "The Impossible OTA Update: Architecting a Generative VLM for an Automotive SoC", "topic": "ota-firmware-updates", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the first three architectural pillars of your OTA VLM plan, and what napkin-math improvement should each provide?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 4}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0632", "title": "The Autonomous Vehicle Power-Performance Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Would you choose a monolithic GPU or a heterogeneous 30W compute module, and how does power duty cycle drive the decision?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 5}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0633", "title": "The Autonomous Valet Retrofit", "topic": "ota-firmware-updates", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What phased optimization strategy would get the OTA Transformer BEV model from 200ms to 33ms with the least deployment risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0634", "title": "The Autonomous Freeway Thermal Throttling Crisis", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you evaluate and select Gen-2 edge hardware that meets 30 FPS within the 60W passive-cooling budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0635", "title": "The Autonomous Fleet Compute Upgrade", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is the BEV-Transformer missing the 33ms deadline, and what should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0636", "title": "The Autonomous Perception Power Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are your first three architectural decisions, and how do you justify them with quantitative reasoning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0637", "title": "The Headlight Flare Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", 
"level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you resolve the HDR INT8 saturation failure and prevent data-dependent quantization failures from recurring?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0638", "title": "The Autonomous Vision Unification Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you make a 2B-parameter ViT replacement for the v4 CNN stack fit the 33ms edge latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0639", "title": "The Perception Platform Redesign", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What roadmap would you propose to move beyond the ResNet CNN stack while staying within the 30W and 33ms Orin limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0640", "title": "The Over-Budget Driver Intention Model", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What multi-stage model, algorithm, and hardware optimization pipeline would you use to get a 15x speedup without dropping below 98% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0641", "title": "The Headlight Blindness Catastrophe", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most likely physical root cause, and how would you redesign your team's quantization and deployment architecture to prevent this entire class of failures in the future?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 3}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0644", "title": "The Autonomous 'Cloud-to-Edge' VLM Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should you deploy a 100B-parameter VLM on a 16 GB edge platform when 50ms covers the full hazard response?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0645", "title": "The Foggy Road Catastrophe", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why did INT8 PTQ preserve 
overall accuracy but fail on foggy pedestrians, and how would you fix it?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 3}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0649", "title": "The Autonomous Perception Horizon", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Which proposal is viable within the 33ms latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0651", "title": "The Autonomous Perception Stack Redesign: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you evaluate whether the 5% mAP ViT can replace the CNN on Orin, and what is wrong with a FLOPs-only analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0653", "title": "The Autonomous Stack Consolidation", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For a 200ms unified Transformer on Jetson AGX Orin, what bounded optimization plan reaches the <25ms perception budget, and what must change if optimization is not enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0655", "title": "The Unexpected Cache Miss Storm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is a common hardware-level reason for such a slowdown in data-intensive loops, even on fast CPUs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0657", "title": "The Bloated INT8 Model", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What explains this discrepancy, and what other components contribute significantly to the overall memory footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0658", "title": "DRAM Bandwidth for 30 FPS Inference", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Do you have enough DRAM bandwidth to run DeepLabv3-MobileNetV2 at 30 FPS alongside the ISP and display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, the stated average demand is about 6.063 GB/s, well below 25.6 GB/s; still profile 
activation traffic and ISP bursts for latency spikes.", "Yes, model weights are 2.1 MB, so at 30 FPS it only uses 63 MB/s, which is <1% of 25.6 GB/s.", "No, the ISP continuous read of 4 GB/s plus the GPU's 13.3 GB/s burst will exceed 25.6 GB/s.", "No, deep segmentation models require at least 32 GB/s of bandwidth due to self-attention activation mapping."], "correct_index": 0}}, {"id": "edge-0659", "title": "Memory for Multi-Camera Tracking", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total memory footprint for the 4-camera pipeline, and is 16 GB sufficient?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 1}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0660", "title": "The DMA Contention Blind Spot", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What physical limitation is causing the simultaneous frame drop and inference latency spike when neither device saturates the bus?", "chain_ids": ["edge-chain-auto-001-05"], "chain_positions": {"edge-chain-auto-001-05": 3}, "chain_tiers": {"edge-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0661", "title": "The MMIO Sensor Bottleneck", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does the I2C sensor polling rate limit the ML model's input freshness, and why does this bus bottleneck cause the model's accuracy to drop despite the GPU running at full speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0662", "title": "The Data Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is a likely bottleneck, and how would you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0663", "title": "The Swap File Latency Cliff", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Nano with a 4GB swap file experience erratic 4,000ms latency spikes during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0664", "title": "The eMMC Cold Start", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the first inference take 2.3 seconds longer than the theoretical eMMC 
transfer time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0665", "title": "The Object Tracking Memory Budget", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is tracking memory truly negligible, and when does it become a bottleneck?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 2}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0666", "title": "The Edge GPU Memory Bandwidth", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the colleague's estimate wrong in both directions — too pessimistic about compute, too optimistic about the real bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck.", "Performance scales linearly with TOPS. This assumes the workload is compute-bound on both platforms and ignores memory traffic and layer fusion.", "The colleague's estimate is wrong because peak TOPS alone misses the roofline: the Orin run is limited by attainable memory bandwidth and effective layer traffic, while the 4090 baseline does not prove both devices have the same bottleneck.", "INT8 quantization reduces model size by 8x compared to FP32, so inference speed should also improve by exactly 8x."], "correct_index": 2}}, {"id": "edge-0667", "title": "The Inference Memory Leak", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where is the 5.4 GB leak coming from over 3 days of continuous inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has a memory leak — switch to a different framework.. 
The model itself is stateless between inferences.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "A rare V4L2/DMA buffer release bug or retained frame reference leaks camera buffers over days, even though the model is loaded once and the intended ring buffer is fixed-size.", "Converting the model to a different framework's format (e.g., ONNX to TFLite) will automatically optimize it for the target hardware."], "correct_index": 2}}, {"id": "edge-0672", "title": "The Repeated Model Loading Memory Leak", "topic": "compound-ai-systems", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's leaking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.\"", "\"The model files aren't being freed — add explicit `del model` calls.\"", "\"The pipeline bottleneck is always the largest model; optimizing smaller models in the pipeline has negligible impact on end-to-end latency.\"", "\"The Qualcomm SNPE (Snapdragon Neural Processing Engine) runtime allocates intermediate activation buffers on the Hexagon DSP's shared memory (ION/DMA-BUF allocations) each time a model is loaded. When the model is unloaded, SNPE releases the model weights but doesn't fully release the DSP's scratch memory allocations.\""], "correct_index": 3}}, {"id": "edge-0673", "title": "The Zero-Copy Illusion", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "If the pointer was passed directly, why is there still a latency spike?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0674", "title": "The Multi-Model Memory Sharing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you build a concrete memory budget for the 760 MB ML working set and preserve room for future models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0675", "title": "The Edge LLM Memory Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What's consuming the memory?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 2}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["\"The model is leaking memory\" or \"Activations are too large.\" The weights are static and activations are small for a single-token decode step.", "The KV-cache. During autoregressive generation, the model stores the key and value tensors for every token generated so far, across every layer.
For Phi-3-mini (32 layers, 32 heads, head_dim=96): KV-cache per token = 2 (K+V) × 32 layers × 32 heads × 96 dim × 2 bytes (FP16) = 393 KB per token.", "If the inference process is still running (visible in ps/top), the system is healthy and no recovery action is needed.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count."], "correct_index": 1}}, {"id": "edge-0676", "title": "The Occupancy Grid Map Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the actual memory and bandwidth requirements for the 200m occupancy grid, and when does it become the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Switching to a lower precision format (e.g., INT4) will double the effective compute throughput and solve the utilization gap.", "Deploy the update to all devices simultaneously to minimize the total rollout duration and reduce the window of version inconsistency.", "A 2D grid is simple and cheap. This ignores the update rate, temporal history, and bandwidth implications of a high-resolution, high-frequency map.", "Grid dimensions: 200m / 0.1m = 2000 cells per axis. Total cells: 2000 × 2000 = 4 million cells. Per-cell storage: 4 + 4 + 1 = 9 bytes."], "correct_index": 3}}, {"id": "edge-0677", "title": "The Brownout Weight Corruption: Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What happened to the model in RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0681", "title": "The Transformer Patch Limit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the ViT completely crash with an OOM error while the CNN merely slows down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0682", "title": "The LiDAR Point Cloud Memory Explosion", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the bottleneck?", "chain_ids": ["edge-chain-auto-001-08"], "chain_positions": {"edge-chain-auto-001-08": 3}, "chain_tiers": {"edge-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "The bottleneck is point cloud preprocessing, not model inference.
Voxelizing 1.2M points (5 sweeps x 240K) involves random memory scatter operations — each point maps to a random voxel location. CPU-based voxelization achieves only ~5 GB/s effective bandwidth (random access pattern) vs 204.8 GB/s peak, taking ~33ms.", "The model inference is too slow. PointPillars on an Orin GPU takes ~15ms."], "correct_index": 2}}, {"id": "edge-0683", "title": "CPU Cluster Cache Contention on Edge SoC", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What advanced memory architecture concept might explain this throughput decrease, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0684", "title": "The Shared Bandwidth Bottleneck", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the most probable system-level bottleneck, and how would you redesign the system to mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0685", "title": "The Zero-Copy Nightmare", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What architectural requirements, challenges, and pitfalls determine whether true zero-copy tensor sharing works on this SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0686", "title": "The Fixed-Point Trade-off", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does FP32-to-fixed-point deployment drop mAP from 90% to 65%, and what steps would you take to fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0687", "title": "The PTQ vs QAT Question", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do you need two weeks for quantization-aware training instead of a five-minute TensorRT INT8 PTQ pass?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0688", "title": "The Coral Edge TPU Quantization Constraint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are your options to handle the 4 fallback layers, and which provides the best tradeoff between accuracy and battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0689", "title": "The Night Scene Calibration Failure", "topic": "quantization-fundamentals",
"competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the INT8 PTQ calibration?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 1}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0690", "title": "The INT8 Calibration Drift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong with the quantization, and how do you fix it without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0691", "title": "INT8 Calibration Set Size vs Accuracy", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many calibration images do you actually need, and what happens if you use too few or too many?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Use all 50,000 images for calibration — more data is always better.. Calibration is not training.", "Calibration determines the min/max (or percentile) range of activations per layer to set the INT8 scale factors. 
The key insight: you need enough samples to capture the activation distribution's tails, not to train the model."], "correct_index": 3}}, {"id": "edge-0692", "title": "Quantization Impact on Detection mAP", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which precision should you deploy for 8 camera streams under a 33ms budget, and what latency and throughput do FP32, FP16, and INT8 imply?", "chain_ids": ["edge-chain-bucket-realtime-04"], "chain_positions": {"edge-chain-bucket-realtime-04": 1}, "chain_tiers": {"edge-chain-bucket-realtime-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0693", "title": "The QAT Cliff", "topic": "safety-certification", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong with PTQ, and what is the principled fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0694", "title": "The Disappearing Pedestrian", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the common pitfalls of deploying FP32 models to INT8 hardware, especially concerning robustness, and how would you diagnose and mitigate these issues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0695", "title": "The Mixed-Precision Perception Stack", "topic": "real-time-deadlines", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What precision would you use for object detection, monocular depth, and motion planning on the Orin, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0696", "title": "The Multi-Core Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you decide where to run the pre-processing stage to minimize overall pipeline latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0697", "title": "The DLA vs GPU Scheduling", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "When does offloading the backbone to a DLA actually improve performance over running everything on the faster GPU?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using a larger model 
with more parameters will improve both accuracy and latency, because larger models have better computational efficiency.", "Your manager is partially right — for a single model on a plugged-in system, the GPU is faster and simpler. But DLA scheduling matters in three scenarios: (1) Pipeline parallelism — the DLA runs the detection backbone while the GPU simultaneously runs the tracking model.", "The thermal issue is caused by ambient temperature; adding a larger heatsink will fully solve the throttling without any software changes.", "\"DLA is always better because it's more power-efficient\" or \"GPU is always better because it's faster.\" Both ignore the scheduling context."], "correct_index": 1}}, {"id": "edge-0698", "title": "The Edge GPU Driver Crash Loop", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What's different about these 30 devices, and how do you design the system to tolerate GPU driver crashes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0700", "title": "The Heterogeneous Scheduler's Dilemma", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you design a real-time task scheduler and resource allocator to ensure all critical tasks meet their deadlines while optimizing for power efficiency and overall system utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model should be retrained with fewer parameters to reduce inference time, as model size is the primary driver of latency.", "\"Just put everything on the NPU, it's fastest.\"", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "A sophisticated, multi-level scheduling and resource management approach is required. It involves workload profiling, static vs.
dynamic scheduling, hardware-aware task mapping, and resource isolation."], "correct_index": 3}}, {"id": "edge-0701", "title": "The Edge LLM Context Window", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What went wrong with the multi-turn generation latency, and how is memory bandwidth involved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0702", "title": "Scheduling Orin GPU, DLAs, and CPU Under a 33ms Deadline", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you schedule these across the Orin's GPU, 2 DLAs, and CPU to meet the deadline?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0703", "title": "Multi-Hardware Model Optimization Pipeline", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a CI/CD pipeline that builds and validates optimized binaries for Orin NX, Hailo-8, and Coral from one PyTorch checkpoint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0704", "title": "The Adaptive Model Diet", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you design a runtime that switches among model variants to meet changing latency, throughput, thermal, and power budgets without human intervention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0705", "title": "The Heterogeneous Choreographer", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition and schedule the multi-stage ML pipeline across CPU, GPU, DSP, and NPU to minimize latency and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0706", "title": "The Functional Safety Redundancy Cost", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an ASIL-D redundant perception path within 50ms without duplicating the GPU, model, and sensor hardware?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0708", "title": "The NPU Definition", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does NPU stand for, and what mathematical operation is it physically optimized to perform?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network Processing Unit; optimized for fast Wi-Fi routing.", "Neural Processing Unit; optimized for dense Multiply-Accumulate (MAC) operations.", "Node Partition Unit; optimized for virtualizing the edge operating system.", "Numeric Precision Unit; optimized for high-accuracy 64-bit floating point math."], "correct_index": 1}}, {"id": "edge-0709", "title": "Thermal Throttling on Edge", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What physical constraint is most likely causing this sudden and permanent drop in performance?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 0}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have drifted due to prolonged inference.", "The OS garbage collector is pausing the inference thread.", "The device overheated, causing the OS to drop the clock frequency (thermal throttling).", "The L1 cache has become permanently fragmented."], "correct_index": 2}}, {"id": "edge-0710", "title": "The A/B Partitioning Storage Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What minimum flash storage is required for a 1 MB firmware image with A/B OTA partitions and 100 KB bootloader/OS overhead?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 0}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.1 MB", "~2.1 MB", "~2.0 MB", "Slightly more than 1 MB"], "correct_index": 1}}, {"id": "edge-0711", "title": "The Hard Real-Time Heartbeat", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a reasonable timeout to set for this watchdog?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 ms", "100 ms", "1 second", "40 ms, the round-trip-time for US cross-country fiber"], "correct_index": 1}}, {"id": "edge-0716", "title": "The Frozen Robot Problem", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hardware mechanism would reboot the robot if its perception software enters an infinite loop and freezes?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A software liveness probe that pings a monitoring service.", "Error-Correcting Code (ECC) memory to prevent corruption.", "A hardware watchdog timer that triggers a CPU reset.", "A graceful degradation module that switches to a simpler model."], "correct_index": 2}}, {"id": "edge-0720", "title": "The OTA Flash Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When planning the over-the-air (OTA) update, which of the following is the most fundamental constraint to identify first?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The vehicle's 4G/5G network bandwidth for the download.", "The power consumed by the flash write operation.", "The available storage space in the inactive firmware partition.", "The compute time required to validate the new model post-installation."], "correct_index": 2}}, {"id": "edge-0722", "title": "The OTA Update Budget", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the FP16 model weight size for 50M parameters, and what is the total download size with a 150 MB container base?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 MB", "100 MB", "250 MB", "350 MB"], "correct_index": 2}}, {"id": "edge-0723", "title": "The OTA Download Fallacy", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long does an 8 GB update take to download over a 1 Gbps cellular connection, ignoring protocol overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 seconds", "4 seconds", "64 seconds", "32 seconds"], "correct_index": 2}}, {"id": "edge-0724", "title": "The Overnight OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long does the 8 GB update take at 10 Mbps, and is the 8-hour overnight window sufficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~13.3 minutes", "~1.8 hours", "~2.4 hours", "~14.2 hours"], "correct_index": 1}}, {"id": "edge-0730", "title": "The Emergency OTA Rollout", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long will the 512 MB OTA package take to download over a 100 Mbps connection?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.1 seconds", "41.0 seconds", "328 seconds", "0.6 seconds"], "correct_index": 1}}, {"id": "edge-0731", 
"title": "The OTA Update Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the $2M one-time R&D cost compare to lifetime OTA data costs for 100,000 vehicles over 5 years?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The R&D cost is $2M, while the data cost is only $400,000 (Trap: calculated 1 year of updates).", "The data cost is $10M, dwarfing the R&D cost (Trap: forgot to convert MB to GB).", "The R&D cost ($2M) and the total data cost ($2M) are exactly equal.", "The costs are negligible because 200MB is small."], "correct_index": 2}}, {"id": "edge-0734", "title": "The OTA Downtime Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum downtime to write the 4 GB model and 32 GB map to UFS 4.0 flash during installation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8.6 seconds", "~17.1 seconds", "~1.9 seconds", "~7.6 seconds"], "correct_index": 1}}, {"id": "edge-0738", "title": "The Automotive OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long will the 2 GB OTA update download take over a stable 40 Mbps cellular connection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~51 seconds", "~16 seconds", "~6.8 minutes", "~2.7 minutes"], "correct_index": 2}}, {"id": "edge-0739", "title": "The OTA Flash Budget Crunch", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the 120 MB vision model update be safely deployed on the 512 MB ECU, and how much flash remains free?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 1}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it fits. The OS and app use 248 MB, leaving 264 MB of free space.", "Yes, it fits. The 120 MB model can be downloaded directly into the 150 MB OTA partition.", "No, it does not fit. After reserving space for the OS and the OTA partition, only 114 MB of flash remains, which is less than the 120 MB required.", "Yes, it fits. 
After the OS (48 MB) and OTA partition (150 MB) are reserved, there is 314 MB of space for the application."], "correct_index": 2}}, {"id": "edge-0740", "title": "The Overnight OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the minimum time required to download this 300 MB OTA package over a 10 Mbps cellular link?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 seconds", "4 minutes", "3.3 minutes", "40 minutes"], "correct_index": 1}}, {"id": "edge-0741", "title": "The OTA Bandwidth Trap: OTA & Firmware Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Roughly how long will one vehicle take to download the 8 GB OTA container over a sustained 100 Mbps connection?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 0}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~80 seconds (~1.3 minutes)", "~11 minutes (~640 seconds)", "~1 minute", "~1 hour"], "correct_index": 1}}, {"id": "edge-0742", "title": "The OTA Update Bottleneck", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum OTA download time for one robot to download the 120 MB model over the 10 Mbps cellular link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 seconds (Calculated Trap)", "96 seconds", "2400 seconds (Calculated Trap)", "9.6 seconds (Calculated Trap)"], "correct_index": 1}}, {"id": "edge-0746", "title": "The Edge Container Overhead", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does Docker's memory overhead reduce the unified memory available for ML model weights and activations on Jetson, and what does the actual memory budget look like?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0747", "title": "The Bricked OTA Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why are ML model updates significantly more dangerous than generic firmware updates in constrained environments, and how does size dictate partition architecture?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 2}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0748", "title": "The Watchdog Timer", "topic": "monitoring-observability", "competency_area": "reliability", "track": 
"edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you detect and recover from a silent TensorRT engine hang when the OS remains responsive?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a hardware watchdog timer — a dedicated hardware peripheral that must be \"kicked\" at regular intervals. If the kick doesn't arrive within the timeout period, the watchdog triggers a hard reset.", "Converting the model to a different framework's format (e.g., ONNX to TFLite) will automatically optimize it for the target hardware.", "Check if the process is running.. The process *is* running — it's blocked inside a CUDA call.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes."], "correct_index": 0}}, {"id": "edge-0749", "title": "The Edge Data Collection Funnel", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should the system decide which images to upload to collect valuable training data without exceeding the 2 GB/month limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0750", "title": "The Zero-Touch Provisioning Pipeline", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is generic provisioning insufficient for 1,000 Coral devices, and what hardware-specific model compilation and calibration must it include?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 0}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0752", "title": "The Gradual Rollout Guru", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the system to enable this A/B testing, ensuring a smooth rollout and easy rollback if issues arise?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 0}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0753", "title": "The OTA Brick Risk", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is pushing a TensorRT .engine OTA riskier than a generic firmware update, and how must deployment handle model-runtime coupling?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 3}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0754", "title": "The Boot Time Budget", "topic": 
"ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you cut boot-to-first-detection from 22 seconds to under 3 seconds, and which changes are mandatory versus low-value optimizations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0755", "title": "The Resource Tug-of-War", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you ensure the safety model always meets its deadlines without significantly starving the analytics model when both contend for the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0756", "title": "The Edge Model A/B Testing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why doesn't cloud-style A/B testing work on edge, and what's the alternative?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 1}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0757", "title": "The Canary Deployment Gone Wrong", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What went wrong with your canary strategy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0758", "title": "The Silent Accuracy Drift", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you detect and diagnose accuracy drift without ground truth labels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0760", "title": "Diagnosis: Unrecoverable Boot Loop from Coupled OTA Rollback", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What went wrong with your rollback strategy?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 2}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0761", "title": "Prometheus Edge Aggregation for Camera Fleet Metrics", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should Prometheus telemetry be redesigned to avoid cardinality overload 
with edge aggregation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0762", "title": "The Edge-Cloud Sync Conflict", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you handle the sync, and what happens to the stale devices' inference results in the meantime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0763", "title": "The Hardware SKU Qualification Matrix", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0764", "title": "The Inference Audit Trail Gap", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong to cause the deterministic model to produce non-deterministic outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0766", "title": "The CAN Bus Telemetry Flood", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What went wrong, and how do you fix it without removing the telemetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0767", "title": "The Cellular Diet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you efficiently manage this deployment without massive overages?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0770", "title": "The Offline-First Edge Design", "topic": "safety-certification", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the system to operate independently of cloud connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0771", "title": "The Model Versioning Fleet Problem", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many distinct model binaries do you need to maintain, and what's the real operational cost?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 1}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-0772", "title": "The Hardware Lifecycle Cliff", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a 3-year transition strategy for 50,000 Jetson TX2 devices reaching end of life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0773", "title": "The Disconnected Brain", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you ensure reliable model updates, send diagnostic telemetry, and maintain local inference capability when the connection drops for extended periods?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0774", "title": "The Fleet Heterogeneity Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you handle the 8x compute gap between the weakest and strongest devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Train one model and compile it for each platform.. A model that runs well on the Orin will OOM or miss deadlines on the Nano.", "The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it.", "You need a model tiering strategy — multiple model variants compiled from the same training run, each targeting a hardware tier:", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model."], "correct_index": 2}}, {"id": "edge-0775", "title": "The Bandwidth-Constrained Model Update", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship the 8.1 MB model update without blowing the 500 MB/month cellular data budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0776", "title": "The On-Device Drift Detector", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design an on-device drift detection system for the Hailo-8 quality inspection fleet within the 2.5 W power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0777", "title": "The Predictive Maintenance Model Lifecycle", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's happening, and how do you fix the lifecycle?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0778", "title": "The Polyglot Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you efficiently build, test, and deploy ML models across this heterogeneous fleet?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 2}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0779", "title": "The Canary in the Coal Mine", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you implement a safe, phased rollout strategy with robust monitoring to catch issues early?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 2}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0781", "title": "Fleet-Wide Model Drift Detection Threshold", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What statistical threshold should trigger a drift alert, and is the mean confidence drop from 0.82 to 0.71 real drift or normal variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0782", "title": "The Inconspicuous Sticker Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design your edge vision system to detect and mitigate such physical-world adversarial attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0783", "title": "Metrics and Alert Thresholds for Cellular Edge Fleet Monitoring", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What metrics do you collect, how do you aggregate them, and what are your alerting thresholds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0784", "title": "The Remote Debugging Nightmare", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you diagnose and fix the erratic inference issue remotely over a high-latency, low-bandwidth connection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0785", "title": "Privacy-Preserving Edge ML with 
Federated Learning and Differential Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design an end-to-end system that respects privacy while enabling ML development?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0787", "title": "The Privacy-Preserving Drift Correction", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you fix the model without ever seeing the raw data?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": {"edge-chain-auto-secondary-017-40": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0788", "title": "Self-Healing Edge AI Fleet", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a self-healing system to cut interventions from 150 per week to fewer than 15 without adding SREs?", "chain_ids": ["edge-chain-auto-022-10"], "chain_positions": {"edge-chain-auto-022-10": 3}, "chain_tiers": {"edge-chain-auto-022-10": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count.", "Classify failure modes by frequency and automate top causes. Typical: (1) model OOM/crash (40%) — automated restart + fallback model; (2) connectivity loss (25%) — local buffering + exponential backoff; (3) sensor degradation (20%) — automated recalibration triggers; (4) storage full (10%) — automated log rotation.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "Add better monitoring and alerting.
More alerts without automated remediation just increases alert fatigue."], "correct_index": 1}}, {"id": "edge-0789", "title": "The Watchdog Blind Spot", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why didn't the hardware watchdog trigger a reboot, and how do you design a watchdog system that actually monitors the ML hardware?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 1}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0793", "title": "The 5-Year Edge Device Lifecycle", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the failure modes you must design for, and how do you achieve the availability target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0794", "title": "The Unattended Fleet", "topic": "ota-firmware-updates", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you achieve 'self-healing' and predictive maintenance for both the ML models and the underlying hardware/software stack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0797", "title": "The ISO 26262 Neural Network Problem", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you certify a neural network under ISO 26262?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0798", "title": "The Remote Fleet Update Dilemma", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a secure OTA update system for remote safety-critical edge devices that prevents bricking and supports rollback after dropped connections?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0799", "title": "The Supply Chain Attack", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How could an attacker inject a backdoored model through the supply chain, and how do model-specific integrity checks differ from generic binary attestation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0802", "title": "The Adversarial Patch Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "How do you defend against this adversarial patch attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0803", "title": "The Model IP Leak", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you protect the model's intellectual property and ensure its integrity against reverse engineering or malicious modification on the edge device itself?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0804", "title": "The Model Theft from Edge Device", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do they extract your model, and what can you do to prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "Attack vectors escalate: (1) Unencrypted disk: 500 + 10 min. (2) Encrypted, no HSM: 5K JTAG + 1 week. (3) API distillation: 100K queries = $50K total. To prevent it, use a full defense stack (secure boot + TEE + HSM + rate-limited API).", "The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "Encrypt the model file on disk."], "correct_index": 1}}, {"id": "edge-0805", "title": "The Autonomous Vehicle Compliance Log", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you log everything without impacting real-time inference or filling the onboard 512 GB NVMe in a day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0806", "title": "The Tamper-Proof Model Fortress", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you secure the ML model from manufacturing to runtime so attackers with physical access cannot modify, replace, or exfiltrate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0807", "title": "The Physical Adversarial Gauntlet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design the robot's perception and decision-making system to be robust against such 'physical world' adversarial attacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Edge-side health checks should focus on hardware metrics (CPU, memory); 
model-level metrics like confidence scores are too noisy to be useful.", "Just train with more adversarial examples. While data augmentation helps, physical attacks often exploit subtle sensor-level vulnerabilities or cross-modal discrepancies that simple data augmentation won't cover.", "The bottleneck is PCIe transfer between CPU and GPU; using zero-copy memory will eliminate this overhead entirely.", "A multi-layered defense strategy is required:\n 1. Multi-Modal Redundancy & Fusion: Don't rely solely on one sensor type. An attack targeting a camera (e.g., sticker on a sign) might not affect Lidar or Radar.\n"], "correct_index": 3}}, {"id": "edge-0808", "title": "The Model Fortress", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Assuming a determined attacker with physical access, how do you protect the model's intellectual property on the device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0809", "title": "The Silent eMMC Death", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does continuous ML inference result logging create a write amplification pattern that kills eMMC faster than generic logging?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0810", "title": "Edge-Cloud Federated Learning System", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a HIPAA-compliant federated learning system for the 500 monitors, including protocol, privacy, convergence, and device budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0811", "title": "The Multicast Model Update Storm", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bandwidth and delivery time do unicast and multicast require for the 150 MB update to 200 Orin nodes, and why does unicast saturate the uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0812", "title": "The DLA-GPU Pipeline Overlap", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the maximum pipeline throughput of this heterogeneous setup compared to sequential execution, and what is the new bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0813", "title": "The CUDA Stream Contention Trap", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why do the two 
CUDA-stream models on the Orin serialize, and what fix would achieve at least partial overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0815", "title": "The Fleet OTA Bandwidth Budget", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What are the full-update cost and time, the delta-update savings, and the total cost impact if 3% of devices have corrupted base models and require full updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0816", "title": "The Edge Inference Offload Decision", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the local versus offloaded end-to-end latencies, and below what bandwidth does offloading become slower than 15 ms local inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0817", "title": "The Multi-Model Memory Tetris", "topic": "3d-parallelism", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should the models be scheduled within 32GB and 50ms, and what changes when adding the 6th 4GB-peak, 20ms model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all 5 models sequentially: 40+15+8+25+12 = 100ms. This exceeds the deadline, so models must be pruned.", "Run all 5 models concurrently using the stated 18 GB peak activation/workspace budget: max(40,15,8,25,12) = 40ms, which meets the deadline and fits within 32 GB. Adding the 6th 4 GB, 20ms model raises peak memory to about 22 GB and still leaves latency at 40ms.", "Schedule in two phases to respect memory limits. Phase 1: detection, path, gesture (15ms). Phase 2: speech, SLAM. 
Total is 55ms, so this misses the 50ms deadline unless SLAM is separately optimized.", "Only budget model weights (6.9 GB) and run everything concurrently, ignoring activation memory and workspace overhead."], "correct_index": 1}}, {"id": "edge-0818", "title": "The RTSP Stream Bandwidth Saturation", "topic": "real-time-deadlines", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Calculate total ingest bandwidth, decode throughput required, and determine the bottleneck — is it network, decode, or inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0819", "title": "Evaluating Robust Aggregation in Corrupted Federated Networks", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which robust aggregator—Krum, coordinate-wise median, or trimmed mean—should replace FedAvg for 10,000 clients with 2% corrupted gradients?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 2}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0820", "title": "Diagnosing Model Poisoning in FedAvg", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does FedAvg fail when 50 of 10,000 devices send L2-25 updates, and how should aggregation be modified?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 1}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0821", "title": "Mitigating Byzantine Poisoning in Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What L2 clipping threshold C limits the attackers' total update norm to 10% of the honest devices' total update norm?", "chain_ids": ["edge-chain-auto-024-06"], "chain_positions": {"edge-chain-auto-024-06": 0}, "chain_tiers": {"edge-chain-auto-024-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0822", "title": "Memory Bounds of EWC Mitigations", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory do the EWC terms add, does the 2M-parameter FP16 model fit in 16 MB SRAM, and what should you use instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0823", "title": "Diagnosing On-Device Catastrophic Forgetting", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "training", "question": "Diagnose the root cause of the base class forgetting and propose a mitigation strategy that fits within a strict 10MB memory constraint?", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 0}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0824", "title": "Mitigating Catastrophic Forgetting on Edge", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which mitigation—EWC, a 50-image replay buffer, or LoRA/classifier head—best prevents forgetting on the 4 TOPS, 512 MB smart camera?", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 1}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0825", "title": "Memory Overhead of EWC on Edge", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total parameter-state memory footprint with EWC and SGD momentum, does it fit in 8 MB SRAM, and how can you make it fit?", "chain_ids": ["edge-chain-auto-025-01"], "chain_positions": {"edge-chain-auto-025-01": 0}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0826", "title": "Debugging EWC Memory Overheads on MCU", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does enabling EWC make fine-tuning the 25K-parameter wake-word model OOM on 256 KB SRAM, and how would you fix it?", "chain_ids": ["edge-chain-auto-025-01"], "chain_positions": {"edge-chain-auto-025-01": 1}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0827", "title": "On-Device Continual Learning: EWC vs Replay Buffers", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Would you use EWC or the 10MB replay buffer for this 50M-parameter assistant, and what is the on-device training memory cost?", "chain_ids": ["edge-chain-auto-025-01"], "chain_positions": {"edge-chain-auto-025-01": 2}, "chain_tiers": {"edge-chain-auto-025-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0828", "title": "Edge Experience Replay Buffer Sizing", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many historical frames fit in the 64MB buffer as uncompressed FP32 versus heavily compressed 25KB JPEGs, and which format is 
structurally superior for this task?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 0}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0829", "title": "Diagnosing Forgetting in Edge Anomaly Detection", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the day-one bearing anomaly to be forgotten, and what flash-backed replay strategy would you use within the 1GB limit?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 1}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Relying purely on regularization techniques like Elastic Weight Consolidation (EWC), which introduces significant compute overhead on the edge CPU, or simply freezing the lower layers, which limits adaptation to new anomalies.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count.", "Implement a reservoir-sampled Experience Replay buffer. Allocate 500MB of the available flash storage to maintain a diverse set of historical edge cases.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets."], "correct_index": 2}}, {"id": "edge-0830", "title": "Latent vs. Raw Experience Replay Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the 1MB Flash replay buffer store 2KB raw sensor windows or 256-byte latent embeddings, and what are the quantitative trade-offs?", "chain_ids": ["edge-chain-auto-025-02"], "chain_positions": {"edge-chain-auto-025-02": 2}, "chain_tiers": {"edge-chain-auto-025-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0831", "title": "Debugging SecAgg Stragglers and Metric Bias", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are 85% of SecAgg rounds failing, and why do successful rounds report 12ms latency instead of the 35ms fleet average?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0832", "title": "Designing Privacy-Preserving Fleet-Wide Analytics", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you estimate the global package-detection FPR under LDP while keeping each doorbell under 5KB/day of uplink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0833", "title": "Federated Analytics
SecAgg Overhead", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the per-camera upload and total server ingress for one SecAgg round with 1,000-bin histograms and k=100 neighbors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0834", "title": "Debugging OOM in Edge Device Finetuning", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the OOM during 4096-token LoRA fine-tuning on 16GB memory, and what software mitigation would you use?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0835", "title": "On-Device Fine-Tuning Memory Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With layer-wise gradient checkpointing, what is the new activation memory footprint and percentage increase in total training FLOPs?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0836", "title": "On-Device LLM Fine-Tuning Memory Trade-offs", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you enable gradient checkpointing for the 1.5B LoRA fine-tune on the Orin Nano, or use swap or shorter sequences instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0837", "title": "Maximum Gradient Bit-Width for 32M-Parameter Upload at 2 Mbps", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum gradient quantization bit-width lets the 32M-parameter model upload within 40 seconds at 2 Mbps with 2MB overhead?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0838", "title": "Choosing 4-bit or 1-bit Gradients for a 5-Minute 1 Mbps Federated Upload", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is 4-bit gradient quantization sufficient in the 5-minute, 1 Mbps upload window, or is 1-bit with error feedback 
required?", "chain_ids": ["edge-chain-auto-secondary-006-05"], "chain_positions": {"edge-chain-auto-secondary-006-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0839", "title": "Diagnosing Federated Quantization Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this failure, and what specific characteristics of the quantized gradients are causing the stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0840", "title": "Diagnosing On-Device LoRA Memory Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does LoRA fine-tuning the 3B model OOM by hitting the 6GB limit, and how would you eliminate the hidden memory bloat?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming that Parameter-Efficient Fine-Tuning (PEFT) inherently guarantees Memory-Efficient Fine-Tuning.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Diagnose that the root cause is activation memory. Even with frozen base weights, backpropagating gradients to the LoRA adapters requires storing the intermediate forward activations for every layer where an adapter is present.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 2}}, {"id": "edge-0841", "title": "LoRA Memory Footprint Calculation", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory is needed for the FP16 7B base, LoRA adapters, and FP32 Adam states for all 3 tasks when adapting W_q and W_v with r=8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0842", "title": "On-Device LoRA Rank Selection Trade-offs", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 7B INT8 LLM on a 32GB Orin, would you choose LoRA rank r=8 or r=64 for W_q and W_v, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0843", "title": "Mitigating Client Drift in Federated Recommendation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you stabilize FedAvg training for the 15MB recommender under extreme non-IID client drift without worsening cellular dropout?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0844", "title": "Diagnosing Federated Client Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the production FedAvg loss to diverge after dialect-skewed rounds 15-20, and how would you mitigate the gradient skew?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0845", "title": "Non-IID Variance Skew in FedAvg", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the variance-contribution ratio of athlete Group B to typical Group A in the FedAvg aggregated gradient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0846", "title": "SRAM Budget for Microcontroller Fine-Tuning", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much SRAM is required for the INT8 base, FP32 layer, gradients, Adam state, and workspace, and does it fit in the 256KB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0847", "title": "Diagnosing OOM in Microcontroller On-Device Fine-Tuning", "topic": "mlops-lifecycle", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the memory exhaustion, and how should the training pipeline be re-architected to fit safely within the SRAM constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0848", "title": "Evaluating On-Device Adaptation Strategies for Mobile Keyboards", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 100M-parameter keyboard model under a 256MB, 10-minute nightly budget, should you use full fine-tuning, LoRA, or a local n-gram cache?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0849", "title": "Diagnosing Thermal Degradation in On-Device FL Scheduling", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the FL training jobs cause overheating, battery drain, and 40% worse p99 app launch latency despite only running when idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0850", "title": "Evaluating Opportunistic On-Device Training Triggers", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "Is running a 20-minute, 5W training epoch whenever the screen is off for 5 minutes feasible, and what scheduling conditions would you require?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0851", "title": "Smartwatch Gesture Personalization", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the smartwatch gesture model be fully fine-tuned or adapted with a frozen backbone and 50K-parameter head under the 10MB training budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "Assuming the primary bottleneck for on-device training is compute (FLOPs) rather than the memory required to store intermediate activations during the forward pass for backpropagation.", "Implement a personalization layer architecture. By freezing the MobileNet backbone, it runs in standard inference mode, meaning intermediate activations are discarded immediately. Only the final layer's input activations and gradients are cached.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance."], "correct_index": 2}}, {"id": "edge-0852", "title": "On-Device Personalization Memory Footprint", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What exact byte footprint is needed for the MLP head's weights, gradients, Adam states, and batch activations with batch size 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0853", "title": "Debugging Edge Shadow Mode OOM Crashes", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the wake-word devices hard crash after enabling shadow-mode evaluation alongside the baseline under a 1.5MB SRAM limit?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 0}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0854", "title": "On-Device Shadow Evaluation Trade-offs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can the AR glasses continuously run both gaze-tracking models on every frame, and what shadow-mode architecture meets 50 FPS and 8MB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0855", "title": "Shadow Evaluation Latency Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": 
"edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum shadow-model evaluation frequency (1 out of every N samples) without dropping 100Hz sensor packets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0856", "title": "Top-K Gradient Sparsity for Constrained Edge Uploads", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What minimum gradient sparsity is required for the 10M-parameter FP32 update to fit the 10-second, 960 kbps upload window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0857", "title": "Diagnosing OOMs in Sparse Federated Learning", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did reducing the network payload cause the edge devices to run out of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0858", "title": "Evaluating Sparse Gradient Updates on Edge", "topic": "extreme-quantization", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you use Top-k sparse gradients to reduce the 40MB dense FL upload over a 1 Mbps link without hurting convergence?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0859", "title": "SRAM Capacity and Activation Tiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much SRAM is needed for the 256x256x32 input, 256x256x64 output, and 3x3 weights, and is tiling required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0860", "title": "SRAM Sizing vs Off-Chip DRAM Power", "topic": "real-time-deadlines", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you choose 4MB on-chip SRAM or stream 2MB of weights from LPDDR4 each frame, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0861", "title": "Diagnosing SRAM Thrashing in Edge Accelerators", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this memory anomaly and its associated power spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The power budget should be managed at the application level by adding sleep intervals between 
inferences.", "Running inference at lower clock speeds increases total energy consumption because the task takes longer.", "SRAM capacity is exceeded by the combined 2.6MB footprint of weights and activations, forcing continuous eviction to external DRAM.", "The external LPDDR4 DRAM is malfunctioning and sending corrupted weights to the SRAM, causing cache misses."], "correct_index": 2}}, {"id": "edge-0862", "title": "Handling 45-Day Stale FedAvg Updates on Maritime Edge Devices", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does applying the ship's 45-day stale update hurt fleet accuracy, and what reconciliation protocol should replace direct averaging?", "chain_ids": ["edge-chain-auto-001-04"], "chain_positions": {"edge-chain-auto-001-04": 2}, "chain_tiers": {"edge-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0863", "title": "Staleness-Aware Reconciliation in Offline Drones", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is Drone Alpha's discounted aggregation weight after 15 stale rounds, and what total INT8 uplink plus downlink payload is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weight 0.05, Payload 10MB", "Weight 0.0125, Payload 10MB", "Weight 0.25, Payload 20MB", "Weight 0.0125, Payload 20MB"], "correct_index": 3}}, {"id": "edge-0864", "title": "Maritime LLM LoRA Reconciliation over VSAT", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should the ship reconcile its 45-day v1.0 LoRA adaptation with the global v2.0 base over a 2 Mbps link without uploading 500GB of telemetry?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0865", "title": "Depthwise Separable Convolution FLOP Savings", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For an input with 256 channels and 256 output channels at spatial resolution 56x56, what FLOP reduction factor does MobileNet's depthwise separable convolution provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x — depthwise separable only halves the convolution cost.", "~4x — it removes spatial computation but keeps channel computation.", "~8-9x — it factorizes a k² * C operation into k² + C operations.", "~64x — each channel is processed independently with no cross-channel interaction."], "correct_index": 2}}, {"id": "edge-0866", "title": "CNN Translation Equivariance as Hardware Efficiency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "From a 
systems perspective, why does a CNN's translation equivariance and local connectivity translate into lower parameter count and FLOPs than a similarly accurate ViT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0867", "title": "The Low-Rank Decomposition Latency Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 8x low-rank FLOP reduction for the 4096x4096 matrix yield only a 15% latency drop on the Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 8x FLOP reduction is real, but the decomposed skinny GEMMs underutilize Tensor Cores, add intermediate memory traffic and launch overhead, and leave other transformer work unchanged, so wall-clock latency falls only modestly.", "Graph-level optimizations are always superior to kernel-level optimizations because they capture global data flow patterns.", "Equating parameter reduction with proportional speedup. 'We have 8x fewer parameters, so we should see ~8x speedup.' This confuses model size with compute time and ignores hardware execution realities.", "The latency is dominated by data preprocessing on the CPU; moving to GPU-accelerated preprocessing will resolve the bottleneck."], "correct_index": 0}}, {"id": "edge-0868", "title": "The Real-Time Compression Stack Design", "topic": "extreme-quantization", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What ordered compression pipeline, target ratios, and accuracy tradeoffs would meet the 50ms/token budget on the Jetson AGX Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0869", "title": "Latency Spikes from NPU Graph Shattering", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If all 40 convolutions are delegated but latency is 85ms with 100% CPU, what is the likely root cause and fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0870", "title": "OOM from Implicit Runtime Layout Transposes", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the 3.5GB attention memory spike despite 120MB theoretical activations, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0871", "title": "NPU Fallback via Dynamic Shapes", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does latency jump from 12ms at 224x224 to 180ms with 100% CPU at 256x256 inputs?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0872", "title": "SRAM Spills in Aggressive Fusion", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did fusing Depthwise Conv + 1x1 Conv on the edge NPU increase latency from 5ms to 18ms and triple DRAM use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0873", "title": "JIT Tracing Memory Leak", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does memory grow by 2MB per new variable-length tensor shape on the edge CPU, eventually OOMing at 4GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0874", "title": "Emulation Overhead in Fused DSP Ops", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did fusing Conv2D + Swish onto the DSP increase latency from 20ms to 45ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0875", "title": "Constant Folding Binary Bloat", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did constant folding the static Embedding -> Dense block expand the MCU binary from 1.2MB to 3.2MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0876", "title": "Dynamic Control Flow Graph Breakages", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do complex scenes with more than 50 detected objects make the JIT-traced detector spike from 15ms to 85ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0877", "title": "AR Glasses Power Bottleneck", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you quantize the 1.2B ViT to fit 1.5GB and under 1W while avoiding the low-light INT4 PTQ accuracy collapse?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0878", "title": "Smart Home KV Cache Crisis", "topic": "extreme-quantization", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the KV cache so three 4000-token conversations fit in the 2.5GB RAM budget without breaking attention quality?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system 
should use standard INT8 quantization across all tokens.", "The system requires an asymmetrical, group-wise KV cache quantization architecture.", "The system should offload the KV cache to the hub's NVMe SSD storage.", "Quantizing the entire KV cache to INT4 linearly across all tokens."], "correct_index": 1}}, {"id": "edge-0879", "title": "Drone Calibration Domain Shift", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does the urban-calibrated INT8 depth model fail in foggy rural areas, and how would you fix it within the 20ms NPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0880", "title": "IoT Vibration Dynamic Range", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you preserve 0.01g turbine anomalies on a Cortex-M7 plus symmetric INT8 accelerator with signals ranging up to 100g?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 4}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using standard min-max calibration over the whole dataset, which maps the 100g peaks to 127 and completely crushes the critical 0.01g micro-fracture signatures into the 0 bin.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "Standard linear quantization fails here because the dynamic range (10^4) exceeds the representational capacity of linear INT8. 
To fix this without upgrading the hardware, the system architecture must incorporate non-linear signal compression before NPU execution.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 2}}, {"id": "edge-0881", "title": "Paged Attention Fragmentation Stalls", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does PagedAttention OOM on 8 short 50-token queries with 512-token pages, and what page size would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0882", "title": "Speculative Decoding Acceptance Drop", "topic": "speculative-decoding", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding with K=4 drop from 25 tok/s in chat to 4 tok/s for structured JSON, and how should the runtime react?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable.", "The model should be retrained with fewer parameters to reduce inference time, as model size is the primary driver of latency.", "The root cause is a catastrophic drop in the draft model's token acceptance rate for out-of-domain or highly structured text. Speculative decoding only yields a speedup if the time saved by accepted drafted tokens outweighs the overhead of running the draft model and the verification step.", "Assuming the JSON output is hitting max-token limits or that the target model lacks JSON training data."], "correct_index": 2}}, {"id": "edge-0883", "title": "The Chunked Prefill Starvation", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does a 6000-token summarization request freeze other decodes for 840ms, and how would you prevent that jitter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0884", "title": "KV Cache Quantization Cast Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT8 KV-cache quantization halve memory but raise decode latency to 65ms/token on the mobile NPU, and what would you change?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 3}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The root cause is the lack of native mixed-precision support for Attention on the NPU, leading to massive dequantization overhead.", "Operator fusion primarily reduces compute time by eliminating redundant arithmetic operations between layers.", "Assuming the 
INT8 memory is simply slower to read than FP16 memory due to alignment issues.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance."], "correct_index": 0}}, {"id": "edge-0885", "title": "Unpadded Batching Inefficiency", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 50-token prompts take 600ms when batched with one 3000-token prompt, and what batching or attention change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0886", "title": "Speculative Decoding KV Cache Leak", "topic": "speculative-decoding", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding OOM after 10 minutes while standard autoregressive decoding runs the full lecture, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it.", "Assuming the draft model's weights are slowly leaking memory over time due to a PyTorch memory management bug.", "Automatic restart on failure is dangerous for safety-critical systems; manual intervention should always be required.", "The root cause is a logical memory leak in how the PagedAttention block table handles rejected speculative tokens. During speculative decoding, the draft model generates K=4 tokens, and their KV caches are allocated in the memory pool so the target model can verify them."], "correct_index": 3}}, {"id": "edge-0887", "title": "Asymmetric Offload PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does splitting prefill onto the NPU and decode onto the CPU take 16.5s, while pure CPU takes only 2.0s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Assuming the NPU is simply too slow at prefill compared to the CPU's vector extensions.", "The root cause is the massive communication overhead of migrating the KV cache between discrete memory spaces. 
When prefill runs on the NPU, it generates the KV cache for the 2048-token prompt in the NPU's local memory.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "Batch size should be increased to improve throughput, which will proportionally reduce per-frame latency."], "correct_index": 1}}, {"id": "edge-0888", "title": "Fleet OTA Update Strategy", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you roll out the 150MB dashcam model over 1 Mbps 3G while minimizing cellular cost and avoiding costly bootloops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0889", "title": "Disconnected Drift Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you detect ultrasound data drift within 7 days without uploading 5MB DICOM images over 20 Kbps links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0890", "title": "Shadow Mode on Tight Edge Memory", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the new 1.2GB vision model against the production model on the 2GB Jetson Nanos without causing an OOM crash?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 2}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0891", "title": "Thermal Throttling Runtime Adaptation", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you keep 30 FPS inference reliable when 45°C ambient thermally throttles the NPU from 800MHz to 300MHz?", "chain_ids": ["edge-chain-bucket-realtime-03"], "chain_positions": {"edge-chain-bucket-realtime-03": 4}, "chain_tiers": {"edge-chain-bucket-realtime-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0892", "title": "On-Device Watchdog and Dual-Bank Rollback for AUVs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design on-device watchdog criteria and dual-bank rollback for the AUV fail-safe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0893", "title": "Heterogeneous Edge Model Registry", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you 
build the CI/CD pipeline and model registry to update all three hardware generations without exhausting device storage?", "chain_ids": ["edge-chain-auto-026-12"], "chain_positions": {"edge-chain-auto-026-12": 3}, "chain_tiers": {"edge-chain-auto-026-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0894", "title": "Skewed Edge A/B Testing", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the HVAC A/B test to measure energy savings despite huge climate and insulation variance across homes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run a standard A/B test but increase the sample size to all 100,000 homes to statistically overpower the variance.", "Use a simple pre/post test on the entire fleet without a control group, assuming historical data is a perfect baseline.", "Running the test longer or randomly assigning more users to smooth out the variance.", "Use Switchback (Crossover) Testing or Stratified Sampling based on edge-computed embeddings. In a switchback design, each individual edge device alternates between Model A (Control) and Model B (Treatment) in randomized blocks (e.g., daily or weekly)."], "correct_index": 3}}, {"id": "edge-0895", "title": "Bandwidth-Aware Edge Debugging", "topic": "mlops-lifecycle", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you collect enough raw frames to debug the 10x intruder false-positive spike without saturating 500 KB/s site uplinks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0898", "title": "Harsh Environment Signal Degradation", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 100GbE latency spike from 500ns to 50us during motor acceleration with zero OS packet drops?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0900", "title": "Adversarial Traffic on Edge Dragonfly", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Dragonfly GNN workload congest one 100Gbps inter-chassis link while other optical links sit idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Operator fusion primarily reduces compute time by eliminating redundant arithmetic operations between layers.", "Rollback capability is unnecessary if the model was validated in the cloud, since cloud accuracy guarantees transfer to edge devices.", "Blaming the GNN's graph partitioning and attempting to manually re-partition the graph.", "Identify the failure of static minimal routing in a Dragonfly topology. 
Dragonfly relies on high-radix routers and sparse global links."], "correct_index": 3}}, {"id": "edge-0901", "title": "ECMP Hash Polarization in Edge Storage", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do four 100Gbps NVMe-oF pulls cap at 100Gbps with one link saturated and three links idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0902", "title": "NPU Underutilization in Depthwise Convolutions", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do MobileNetV2 depthwise convolutions use only about 6% of the 16x16 systolic array, and how would you redesign around it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0903", "title": "FPGA DSP Slice Mapping for INT4", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did INT4 quantization congest routing and spike power, and how would you avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0904", "title": "Dataflow Bottleneck in High-Res Drone NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does the Weight-Stationary dataflow make the 4K CNN hit only 12 FPS and max out DRAM power, and what dataflow should early layers use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0905", "title": "MCU SIMD Alignment for Audio CNNs", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the INT8 keyword-spotting CNN using scalar CMSIS-NN kernels on the Cortex-M4, and how would you hit the 50ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0906", "title": "SmartNIC NPU Arithmetic Intensity Mismatch", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the SmartNIC NPU sit at 4% utilization on a batch-1 LSTM DPI model, and what model change fixes it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0907", "title": "Edge TPU Spatial Tiling Overhead", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is 
causing the 4 FPS bottleneck, and how would you change tiling and execution to reach 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The severe bottleneck is redundant USB host-to-accelerator transfer from naive spatial tiling; U-Net receptive fields require overlapping halos around each tile, so tiny host tiles repeatedly resend the same pixels and intermediate boundaries.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements.", "Profiling overhead is masking the true performance; disabling all instrumentation will reveal that the system is already meeting targets.", "Compressing the image data over the USB bus using JPEG, which introduces latency and compression artifacts ruining the super-resolution."], "correct_index": 0}}, {"id": "edge-0908", "title": "Edge GPU LLM KV Cache Thrashing", "topic": "safety-certification", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing L2 thrashing during decode, and how would you reduce KV-cache bandwidth to improve tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["During auto-regressive decoding, the attention mechanism must read the entire KV cache from DRAM for every single generated token. Even though the 1GB KV cache easily fits in the 64GB unified memory, it is vastly larger than the 8MB L2 cache.", "Quantizing the model weights further to INT4, which won't fix the attention phase bottleneck.", "The low utilization is due to kernel launch overhead on the NPU; fusing all layers into a single kernel will achieve near-peak throughput.", "Increasing the number of inference threads will proportionally increase throughput, since modern accelerators scale linearly with thread count."], "correct_index": 0}}, {"id": "edge-0909", "title": "Multi-Camera NPU PCIe Bottleneck", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does latency spike despite 40% NPU utilization, and how would you redesign the video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0910", "title": "Wearable SRAM Weight Pinning", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you modify the 1.2MB keyword-spotting model and memory layout to meet the 5mW smartwatch budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0911", "title": "On-Device LLM KV Cache Spilling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does generation slow at a 2000-token context, and how would you manage the KV cache to avoid OS kills?", "chain_ids": ["edge-chain-auto-024-03"], 
"chain_positions": {"edge-chain-auto-024-03": 3}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0912", "title": "AR Glasses Memory Fragmentation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you fix the bad_alloc crashes and 3ms tensor allocation latency despite 1.2GB free RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0913", "title": "ADAS SoC L3 Cache Contention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does CPU LiDAR preprocessing raise ViT latency from 15ms to 42ms, and how would you isolate memory contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0914", "title": "Drone Fused Layer Tiling", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Why are the accelerator MACs only 18% utilized, and what SRAM tiling strategy would reduce the 200ms frame time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0915", "title": "Smart Camera Zero-Copy Pipeline", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the ISP-to-NPU pipeline to meet the 33ms frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0916", "title": "MobileNet vs VGG NPU Bottleneck", "topic": "roofline-analysis", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the lower-GOPS MobileNetV2 run slower than the VGG-like model, and which model should you choose to meet 200+ FPS on the 2 TOPS, 4 GB/s NPU?", "chain_ids": ["edge-chain-auto-018-08"], "chain_positions": {"edge-chain-auto-018-08": 3}, "chain_tiers": {"edge-chain-auto-018-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0917", "title": "Drone Power-Precision Tradeoff", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which accelerator should run obstacle avoidance to maximize flight time, given the 12 GOPS INT8 NPU model and 1% accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0918", "title": "Edge LLM Decode 
Bandwidth Ceiling", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is decode bandwidth-bound despite 50 TOPS, and what single optimization best improves throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0919", "title": "Operator Fusion for Arithmetic Intensity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "What optimization would let the Transformer encoder meet the 500 QPS SLA on the 2 TOPS, 4 GB/s NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0920", "title": "Quantization Roofline Shift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the FP32 and INT8 inference times, and how does quantization shift the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantization reduces the memory footprint, shifting the workload from memory-bound to compute-bound, dropping inference time to 1.56ms.", "Quantization shifts the workload from compute-bound to memory-bound, dropping inference time to 1.56ms.", "The bottleneck remains memory-bandwidth, but the 4x reduction in data transfer drops inference time to 0.98ms.", "The inference time will not improve because the total MOPS and peak GOPS remain unchanged."], "correct_index": 0}}, {"id": "edge-0921", "title": "Hardware-Aware Batching for Edge NLP", "topic": "safety-certification", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Given an NPU with 5 TOPS compute and 5 GB/s memory bandwidth, should the 4 audio streams run sequentially or as Batch=4 to meet the 15ms SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Calculating compute time only (40 GOPS / 5 TOPS = 8ms) and assuming both batching strategies will easily meet the 15ms SLA.", "Increasing batch size will shift the workload from memory-bound to compute-bound, fully utilizing the available TOPS.", "Batching alters Arithmetic Intensity. 
For Batch=1, the 20MB weights are loaded 4 separate times, making the workload highly memory-bound and exceeding the SLA due to 16.8ms cumulative memory transfer latency.", "The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it."], "correct_index": 2}}, {"id": "edge-0922", "title": "Shared Bus Bottleneck in SoC Pipelines", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Will upgrading the NPU to 2x TOPS stop the 30 FPS frame drops, and what should be optimized instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0923", "title": "SRAM vs DRAM Energy Roofline", "topic": "safety-certification", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the 2MB Transformer model fit the 5mW power budget at 100 Hz, and what dominates the energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0924", "title": "Fleet-wide Rollback under Bandwidth Constraints", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you detect the 2-hour latency creep and safely roll back 50,000 dashcams without saturating cellular uplinks?", "chain_ids": ["edge-chain-auto-027-16"], "chain_positions": {"edge-chain-auto-027-16": 3}, "chain_tiers": {"edge-chain-auto-027-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0925", "title": "Ultra-Low Bandwidth Drift Detection", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What observability payload would monitor seasonal data drift within the 10 KB/s satellite uplink limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0926", "title": "Debugging Thermal Throttling Cascades", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you observe thermal-throttle jitter on the glasses without adding polling heat or missing the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0927", "title": "Safe A/B Testing on Storage-Constrained Edge", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you A/B test V2 across 500,000 vacuums when V1 and V2 cannot both fit in 16MB flash?", "chain_ids": ["edge-chain-auto-025-03"], "chain_positions": {"edge-chain-auto-025-03": 1}, "chain_tiers": {"edge-chain-auto-025-03": "primary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0928", "title": "Tracing Latency in Hybrid Edge-Cloud Pipelines", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you add distributed tracing to isolate the 1200ms P99 bottleneck with under 2ms edge-path overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0929", "title": "Federated Learning Energy Optimization", "topic": "federated-learning", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which federated learning schedule uses less total energy per device to converge, 1 epoch for 200 rounds or 5 epochs for 80 rounds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0930", "title": "AR Glasses Thermal Management via DVFS", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the headset use race-to-sleep or DVFS at 0.6 GHz and 0.6V to meet the 1.2W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0931", "title": "Smart Camera Carbon Payback Period", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the carbon payback period for adding the 20 kg CO2e edge AI modules to the 10,000 traffic cameras?", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0932", "title": "Autonomous Vehicle Dynamic Power Capping", "topic": "real-time-deadlines", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 10% frequency and voltage reduction on the 3 perception nodes absorb the 65W planning spike while maintaining 30 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0933", "title": "Drone Edge Compute vs Flight Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which precision, FP16 or INT8, maximizes total powerline-inspection distance for the 150 Wh drone?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0934", "title": "Retail Edge vs Cloud LLM TCO", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "optimization", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which option has lower 3-year TCO per store for 20,000 tokens/day, cloud API serving (0.50 per 1M tokens) or a local edge server (1,500 CapEx, 150W continuous, $0.12/kWh)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0935", "title": "Mobile Speculative Decoding Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the Joules-per-token cost of standard generation versus speculative decoding on the smartphone LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0936", "title": "IoT Edge TPU Sleep State Optimization", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which sleep state, clock-gated or power-gated, minimizes average energy per 200ms anomaly-detection cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0937", "title": "Energy Optimization Strategy Selection", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which saves more energy on a chip with 20 pJ/byte DRAM and 0.5 pJ/FLOP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0938", "title": "Receptive Field vs Model Size Trade-off for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which achieves the best RF within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0939", "title": "Designing Fallback for an Autonomous System", "topic": "data-efficiency-selection", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is best for a safety-critical system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0940", "title": "T4 LLM Decode Batching Tradeoff", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the latency and throughput tradeoff between batch size 1 and batch size 8 for autoregressive decoding on the T4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0941", "title": "Roofline Tradeoff for Standard CNN vs Depthwise on Edge SoC", "topic": "real-time-deadlines", "competency_area": 
"deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which layer should you choose for the 4K 30 FPS vision model, standard CNN or depthwise, under the roofline tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0942", "title": "Edge New 0003", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the 100KB FP32 weight matrix be streamed from Flash or preloaded into SRAM at a 10Hz sampling rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0943", "title": "T4 1080p-to-4K Scaling: Compute vs Memory Bottleneck", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How will FPS scale when the video model moves from 1080p to 4K on the T4, and what bottleneck dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0944", "title": "Sequence Length Scaling Bottlenecks in Edge Vision Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does increasing ViT patch sequence length from 256 to 1024 shift the execution bottleneck on Orin?", "chain_ids": ["edge-chain-auto-017-12"], "chain_positions": {"edge-chain-auto-017-12": 1}, "chain_tiers": {"edge-chain-auto-017-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0945", "title": "INT8 vs INT4 for a 14B LLM on a 16GB T4", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the 14B LLM use INT8 or INT4 on the 16GB T4, considering memory bandwidth, KV cache, and generation speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0946", "title": "Edge New 0007", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is INT8 still faster than FP32 on the Cortex-M4 despite 4 extra unpack cycles per weight?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0947", "title": "2:4 Sparse INT8 Roofline Tradeoffs on Orin", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What execution tradeoffs should you expect from applying 2:4 sparsity with INT8 on Orin given 200GB/s memory bandwidth?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0948", "title": "Dynamic FP16 vs Static INT8 on T4 GPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 1 million requests/day on a T4 edge server, should you deploy dynamic FP16 or calibrated static INT8, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0949", "title": "Edge New 0010", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the 500KB Cortex-M4 model use 50% unstructured pruning or INT8 quantization to fit 256KB SRAM, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0950", "title": "Edge New 0011", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 4 synchronized 1080p 30 FPS streams on Orin, should you batch all frames or run 4 concurrent single-batch streams?", "chain_ids": ["edge-chain-bucket-realtime-01"], "chain_positions": {"edge-chain-bucket-realtime-01": 3}, "chain_tiers": {"edge-chain-bucket-realtime-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0951", "title": "1kHz Sensor Stream: Immediate Inference vs Batching on Cortex-M4", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "At a 1kHz sensor rate with 4ms inference, should the Cortex-M4 drop samples for immediate inference or batch 10 samples?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0952", "title": "T4 Edge Dynamic Batching for Strict 50ms SLA", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should you choose the dynamic batching timeout and max batch size to meet the 50ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0953", "title": "Orin Robotics CPU vs GPU Image Preprocessing", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For the 60 FPS robotics loop, should image preprocessing stay on the ARM CPU or be offloaded to the GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0955", "title": "Cortex-M4 Duty Cycling Energy for 100ms Inference Intervals", "topic": 
"duty-cycling", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the IoT node continuously analyze data or duty-cycle to run inference once every 100ms, and what is the energy tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0956", "title": "Edge New 0017", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the Orin AGX robotics app use 15W or 60W mode to meet 30 FPS, and how do their FPS-per-watt efficiencies compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0957", "title": "Orin DVFS Energy per Inference Tradeoff", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Does lowering the Orin GPU clock by 20% reduce energy per inference despite a 25% latency increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0958", "title": "Edge New 0021", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 10MB frames at 30 FPS on Orin, should the pipeline use explicit CPU-to-GPU copies or unified zero-copy memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0959", "title": "Edge New 0022", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the accelerator keep intermediate tensors in GDDR6 or offload them to host DDR4 over PCIe Gen4 to save VRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0962", "title": "Graceful Degradation Under Thermal Throttling", "topic": "real-time-deadlines", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "When Orin throttles to 100 TOPS but the pipeline needs 120 TOPS at 30 FPS, how should it gracefully degrade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0963", "title": "The KV Cache Checkpoint Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What inference latency overhead comes from checkpointing a 1GB LLM KV cache over PCIe Gen4 every 10 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0964", 
"title": "T4 Continuous Batching Under a 100ms LLM Latency SLA", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What maximum continuous batch size on the T4 maximizes LLM throughput without exceeding the 100ms per-token SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0966", "title": "T4 Continuous Batching Prefill-vs-Decode Tradeoff", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Should the T4 run a 500ms LLM prefill monolithically or use chunked prefill while decoding at 20ms/token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0967", "title": "Dynamic KV-Cache Management via PageAttention on Edge T4", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should the 16GB T4 implement PageAttention for dynamic KV-cache management, and what tradeoff does it create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0968", "title": "Edge New 0032", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 50GFLOP inference and 5MB image over a 50MB/s 5G link, should the T4 run locally or offload to an H100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0969", "title": "Edge New 0033", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "For 10KB accelerometer windows, should the wearable transmit raw BLE data or run local inference and send a 10-byte result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0970", "title": "Edge New 0034", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For split CNN inference over a fluctuating 10–50MB/s 5G link, should the Orin send the 5MB raw image or a 1MB intermediate tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0971", "title": "Pipeline vs Tensor Parallelism for a 30B Transformer over 1GbE Edge Orins", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 30B transformer across 4 Orins on 1Gbps Ethernet, should you use pipeline parallelism or tensor parallelism, and why?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0972", "title": "Bursty Edge Queue Diagnosis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "If the 220 ms latency includes the frame's own 20 ms service time, how many frames are queued ahead, how many are in the system, and what arrival-process characteristic likely changed?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0975", "title": "Edge Swarm Feature Map Aggregation over 5G", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the exact compression ratio required to sustain real-time processing and prevent uplink saturation?", "visual": {"kind": "svg", "path": "edge-0975.svg", "alt": "A fanout diagram showing multiple drones funneling data into a single constrained wireless network node, which then connects to an aggregation server.", "caption": "Swarm Uplink Architecture"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0976", "title": "Asymmetric Split-Computing Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency of a single inference and the maximum sustained pipeline throughput in inferences per second?", "visual": {"kind": "svg", "path": "edge-0976.svg", "alt": "A Gantt chart illustrating the staggered execution of M4 compute, Bluetooth transfer, and NPU compute across multiple pipeline cycles.", "caption": "Split-Computing Pipelined Execution"}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0977", "title": "Edge NPU Queueing Under Bursty Loads", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the utilization of the Hailo-8 NPU and the average queue length using M/D/1 queueing principles?", "visual": {"kind": "svg", "path": "edge-0977.svg", "alt": "A line plot showing the hockey-stick curve of queue length increasing exponentially as NPU utilization approaches 1.0.", "caption": "M/D/1 Queue Length vs Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0979", "title": "Autonomous Vehicle Edge Data Pruning Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the required data drop rate (selectivity) the A17 model must achieve to prevent the SSD from overflowing during the 8-hour shift?", 
"visual": {"kind": "svg", "path": "edge-0979.svg", "alt": "A stepped bar chart showing data volume decreasing massively from 144 TB Raw Sensor Data down to 16 TB Saved Data after passing through the A17 Neural Engine filter.", "caption": "Data Volume Reduction Pipeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0980", "title": "Wake-on-VAD Duty Cycling Battery Life", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average daily energy consumption in milli-Watt-hours (mWh) and expected battery life on a 3000 mWh cell, and does it meet a 1-year target?", "visual": {"kind": "svg", "path": "edge-0980.svg", "alt": "A sleep/wake timeline showing power consumption spikes to 5mW every 100ms with a baseline of 0.1mW.", "caption": "VAD duty cycle power profile."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0982", "title": "Edge Intermittent Power Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the optimal checkpoint interval (in seconds) that balances checkpoint overhead against recompute time to maximize useful training progress?", "visual": {"kind": "svg", "path": "edge-0982.svg", "alt": "A graph showing checkpoint overhead decreasing and failure recompute time increasing as the checkpoint interval grows, with a minimum cost at 268 seconds.", "caption": "Optimal checkpoint interval trade-off."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0985", "title": "Jetson AGX Orin 4K30 Multi-Camera Ingestion", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should the data pipeline from camera ingestion to NPU inference be designed to prevent main LPDDR5 memory bandwidth starvation and frame dropping?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0995", "title": "Analyzing Activation Memory Footprint for MobileNetV3 on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What peak activation memory should you expect for MobileNetV3-Large on Orin at batch 1 and 224x224 input?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0996", "title": "Designing a Memory-Constrained Training Loop for On-Device Fine-Tuning on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the LoRA fine-tuning loop on Orin to fit the 3B VLM within 16GB total memory while achieving at least 5 training steps per minute?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0997", "title": "Evaluating Activation Memory Impact of Batch Size Scaling on Jetson Orin Inference", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is YOLOv8-L memory scaling from batch 1 to 8 linear, and what likely causes the OOM at batch 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0998", "title": "Evaluating Hailo-8 vs. Jetson Orin Activation Memory Architecture for Embedded CV", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which platform, Hailo-8 or Jetson Orin, better handles EfficientDet-D2 activation memory at 512x512, and what constraints matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-0999", "title": "Fluency: Explaining Activation Memory vs. Weight Memory Trade-off for Edge Practitioners", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why can a MobileNetV3 with only 5MB of weights consume over 200MB during inference on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1000", "title": "Fluency: Identifying When Gradient Checkpointing Helps vs. 
Hurts on Constrained Edge Hardware", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why might gradient checkpointing on Orin increase ResNet training time by 80% instead of the expected 33%?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1001", "title": "Implementing Streaming Activation Processing for Video on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How can you stream a 32-frame SlowFast-R50 model so activations fit within 6GB without accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1002", "title": "Mastery: Proving the Memory-Compute Optimality of Gradient Checkpointing Under Edge Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What checkpoint interval k minimizes training step time for the 50-layer ResNet under a 500MB activation budget, and why is it optimal?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 4}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1003", "title": "Mastery: Activation Memory Analysis for Recurrent Models on Ultra-Low-Power Edge Devices", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Does BPTT for Mamba-130M over T=100 fit in 6 MB SRAM, and what should you use if it does not?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1004", "title": "Reducing TensorRT ResNet-50 Residual Activation Memory with INT8 Strict Types", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How can you reduce TensorRT ResNet-50 residual-path activation memory below 1 GB at batch 32, and what batch-size adjustment is needed if INT8 strict typing alone is not enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1005", "title": "Realizing Activation Memory Reduction Through Quantization-Aware Fine-Tuning on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does QAT for MobileNetV2 use more activation memory than fp32 training on the edge device, and how can 
you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1006", "title": "Realizing Tiled Activation Processing for CNNs on Hailo-8 SRAM Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How does Hailo-8 handle the ResNet-34 7x7 stem activation that exceeds SRAM, and what latency tradeoff results?", "chain_ids": ["edge-chain-auto-secondary-003-19"], "chain_positions": {"edge-chain-auto-secondary-003-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1007", "title": "Recall: Defining Activation Memory and Its Relationship to Sequence Length", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What exactly is activation memory, and why does it scale with sequence length for Transformers but not for CNNs?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1008", "title": "Specifying Activation Memory Constraints for a Multi-Model Inference Pipeline on Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What peak activation memory bounds should you specify so YOLOv8-L, SegFormer-B2, and BEVFusion fit concurrently in 32GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1009", "title": "Recall MobileNet depthwise separable FLOP savings on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why are MobileNet architectures preferred over ResNets for edge vision on Jetson Orin, and how do depthwise separable convolutions help?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1010", "title": "Fluency: compare Hailo-8 vs Jetson Orin for MobileNet inference efficiency", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the true energy efficiency of this inference, and why is it a mistake to report it as 0.088 TOPS/W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1011", "title": "Fluency: explain 
MobileNet width multiplier for resource-constrained edge deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What does a 0.5 MobileNetV3 width multiplier mean, how does it affect FLOPs and accuracy, and which value should Hailo-8 use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1012", "title": "Implement MobileNetV3 INT8 quantization for Hailo-8 dataflow compilation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you compile MobileNetV3-Large for Hailo-8 and handle INT8 quantization of its H-swish activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1013", "title": "Jetson Orin Utilization: GFLOPs vs TFLOPS", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the actual GPU utilization for the 21 GFLOPs workload on a 69 TFLOPS Orin profile, and why is 30% wrong?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1014", "title": "Optimize Jetson Orin Depthwise Convolution Latency with TensorRT Fusion", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you optimize the depthwise convolutions specifically for the Jetson Orin's Ampere architecture to reduce this latency?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1015", "title": "Realization: deploy EfficientNet-Lite2 on Jetson Orin for multi-stream video inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you build the GPU preprocessing and batched TensorRT pipeline for 8 1080p streams at 15 FPS on one Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1016", "title": "Compile Shared-Backbone Two-Head MobileNetV3 for Hailo-8 Inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you compile the two-head MobileNetV3 model on Hailo-8 so the 0.18 GFLOP backbone runs only once?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1017", "title": "Choose a Hailo-8 Model for a 5 FPS Solar Wildlife Camera", "topic": 
"cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which of the three listed models would you choose for the 5 FPS solar Hailo-8 wildlife camera, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1018", "title": "Specification: define CNN model requirements for Hailo-8 production deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which key model constraints should you check for a 20-class, 30 FPS MobileNet-family classifier on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1019", "title": "Analyzing RAG Feasibility on Jetson Orin for On-Device Retrieval", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Is on-device RAG over 50,000 manuals feasible on Jetson Orin under a 3-second response budget, and what bottleneck dominates?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1020", "title": "Designing a Cascaded Model Pipeline on Jetson Orin + Coral TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should you partition the cascaded classifier pipeline between the Jetson Orin and Coral Edge TPU to stay within a 40W total budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1021", "title": "Explaining Agentic Tool-Use Constraints on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the 8-tool-call agent slow from 4 seconds in the cloud to 47 seconds on the edge?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1022", "title": "Implementing Offline-First RAG with Incremental Index Updates on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you update the 10,000-document FAISS index incrementally so new documents are searchable within 30 seconds and offline operation continues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1023", "title": "Implementing Model Routing Between Coral TPU and Jetson Orin Based on Input Complexity", "topic": "compound-ai-systems", 
"competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a gatekeeper routing layer for simple commands versus complex LLM queries to maintain low latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1024", "title": "Mastering Context Window Management for Long-Running Edge Agents", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Design a context compression strategy that preserves critical information while keeping prefill under 500ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1025", "title": "Mastering Power-Aware Pipeline Scheduling on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule the three Jetson Orin pipeline stages to keep average power under 35W over any 10-second window?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1026", "title": "Optimizing FAISS Index Type Selection for Jetson Orin Memory Constraints", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which FAISS index should you use for 500,000 768-dim embeddings on Orin, and what IVF cluster count and nprobe would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1027", "title": "Realizing a Two-Stage Compound Pipeline on Coral TPU + Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using an M/D/1 queue at ρ=0.63, what is the average Orin queue length for flagged Coral TPU frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1028", "title": "Recalling Key Constraints of Coral Edge TPU for Compound Pipelines", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the fundamental hardware constraints of the Coral Edge TPU that determine whether this deployment is feasible?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1029", "title": "Recalling Jetson Orin Memory Architecture for Multi-Model Compound Systems", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": 
"published", "phase": "both", "question": "Does LPDDR5 unified memory on Jetson Orin mean the CPU, GPU, and DLA all share the same physical DRAM, and what are the bandwidth implications for concurrent multi-model execution?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1030", "title": "Theoretical Throughput Calculation for Edge AI Models", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the theoretical FPS for a 219M-MAC INT8 model on a 275 TOPS Jetson Orin, and why is 627 FPS off by 1000x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1031", "title": "Dataset Curation: Design On-Device Data Collection Pipeline for Edge Models", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the on-device quality filtering and upload pipeline for 10,000 Orin devices capturing 100 images per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1032", "title": "Dataset Curation: Evaluate Centralized vs Federated Data Curation for Edge Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do centralized upload, federated updates, and a hybrid approach compare for the 5,000-device warehouse data pipeline in terms of data quality, bandwidth, privacy, and model improvement rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1033", "title": "Dataset Curation: Evaluate Active Learning Strategies for Edge-Deployed Models", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare entropy, margin, and random image selection for active learning on the 200 inference/sec Jetson Orin detector?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1034", "title": "Dataset Curation: Implement On-Device Data Augmentation Budget for Jetson Orin", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you implement augmentation for 64-image batches and verify it stays under the 10 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1035", "title": "Federated Learning Data Strategy for Industrial Edge Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What federated learning data strategy would you use for 50,000 non-IID Jetson Orin devices to reach 95% defect precision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1036", "title": "Edge Replay Buffer Design for Continual Learning with 8GB Storage", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the 8GB replay buffer and update schedule to adapt to new product classes without catastrophic forgetting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1037", "title": "Dataset Curation: Optimize Label Efficiency for Edge Model Training", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you choose 10,000 of the 100,000 unlabeled images to maximize mAP gain under the $5,000 labeling budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1038", "title": "Dataset Curation: Optimize On-Device Dataset Compression for Storage-Constrained Edge", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you store 50,000 training examples within the 4GB budget without significant quality loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1039", "title": "Design an Edge-to-Cloud Data Pipeline for Monthly Model Refresh", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the monthly model refresh pipeline from edge collection to cloud fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1040", "title": "Dataset Curation: Specify Data Quality Requirements for Safety-Critical Edge Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative data quality SLAs would you set for the pedestrian detector to achieve a < 0.1% miss rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1041", "title": "Fault Tolerance: Design Checkpoint Strategy for Jetson Orin Continuous Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": 
"published", "phase": "training", "question": "How would you design fault tolerance for model state and continuity across power loss, storage corruption, and model degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1042", "title": "Fault Tolerance: Evaluate Warm vs Cold Recovery for Edge Device Failures", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the failed Orin device, how do cold cloud recovery and warm NVMe snapshot recovery compare in latency and state freshness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1043", "title": "Fault Tolerance: Evaluate Redundancy Strategies for Mission-Critical Edge Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which redundancy architecture meets the 99.99% uptime requirement for real-time inspection, and how do they compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1044", "title": "Fault Tolerance: Fluency — MTBF Calculation for Jetson Orin Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 1,000 outdoor Orin devices with 40,000-hour MTBF, what monthly failure rate and spare inventory are needed for 98% availability?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1045", "title": "Fault Tolerance: Fluency — Checkpoint Size for Edge Online Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What should each YOLOv8-medium LoRA checkpoint include, and what are its size, write time, and 30-day storage use?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 1}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1046", "title": "Edge Monitoring for 2,000 Jetson Orin Devices with 5-Minute Alerting", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you monitor 2,000 Orin devices to alert within 5 minutes on latency, accuracy, or power degradation, and what bandwidth is required?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 2}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1047", "title": "Fault Tolerance: Implement OTA Model Update 
with Rollback for Jetson Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the OTA model update protocol with automatic rollback so a bad 200MB update affects less than 1% of the 500-device fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1048", "title": "Fault-Tolerant Architecture for 1,000 Online-Learning Orin Vision Systems", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What complete fault-tolerance architecture would you design for 1,000 online-learning Orin vision systems to achieve 99.95% uptime?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 4}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1049", "title": "Fault Tolerance: Mastery — Recovery Time Analysis for Edge Learning System", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you analyze the three Orin failure recovery paths and improve each to meet a 5-minute RTO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1050", "title": "Fault Tolerance: Optimize Checkpoint Frequency for Battery-Powered Edge Device", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much battery does checkpointing 2MB LoRA weights every 5 minutes consume, and is it worth reducing the frequency?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 2}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1051", "title": "Fault Tolerance: Realize Checkpoint Storage Architecture for 200-Device Jetson Fleet", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What storage tiers would you specify for 30-day LoRA checkpoint history, 1-hour cloud backup, and 5-minute recovery across 200 Orin devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1052", "title": "Fault Tolerance: Realize Fault Detection Latency for Real-Time Edge Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you implement fault detection on the 60 fps system to catch 
inference failures within 500ms with alerting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1053", "title": "Fault Tolerance: Recall — What is MTBF and Why Does It Matter for Edge ML?", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is MTBF, how should it determine checkpoint frequency for online learning, and what is the optimal checkpoint interval formula?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 0}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1054", "title": "Fault Tolerance Specification for Safety-Critical Edge Vision", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What reliability specification would you write to meet SIL-2, including metrics, checkpoints, detection, and manual override?", "chain_ids": ["edge-chain-auto-024-05"], "chain_positions": {"edge-chain-auto-024-05": 3}, "chain_tiers": {"edge-chain-auto-024-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1055", "title": "Kernel Fusion: Recall — Why is Kernel Fusion Critical for Jetson Orin?", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does kernel fusion matter significantly more on an edge SoC than on an H100, and what is the effective memory bandwidth constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1056", "title": "Kernel Fusion: Design Efficient Inference Kernel for MobileNet on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the activation size of a 112x112x16 depthwise-conv output in float32, and why is 6.4MB wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1057", "title": "Kernel Fusion: Evaluate ONNX Runtime vs TensorRT Fusion on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do ONNX Runtime with CUDA EP and TensorRT compare for ResNet-50 on Jetson Orin in latency, throughput, and power efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1058", "title": "Kernel Fusion: Evaluate Depthwise Convolution Fusion Strategies on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": 
"evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the 56x56x32 MobileNetV2 inverted residual block, which fusion strategy is viable and what memory and latency savings should you expect?", "chain_ids": ["edge-chain-auto-025-12"], "chain_positions": {"edge-chain-auto-025-12": 3}, "chain_tiers": {"edge-chain-auto-025-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1059", "title": "Jetson Orin INT8 Compute Time for YOLOv8-Nano", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the theoretical INT8 compute time for 8.7B ops on a 275 TOPS Jetson Orin, and what TOPS/GOPS unit mistake should you avoid?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1060", "title": "Kernel Fusion: Optimize Inference Kernel for Depthwise Separable Convolution", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With DWConv taking 35% of time at 2 OPS/byte versus a 4 OPS/byte ridge point, what is the bottleneck and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1061", "title": "Kernel Fusion: Optimize Int8 Fusion Pipeline for Hailo-8 NPU", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you reduce YOLOv5s latency from 8ms to 5ms on Hailo-8 by exploiting compile-time dataflow fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1062", "title": "Kernel Fusion Impact for EfficientDet on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For EfficientDet-D0 on Jetson Orin, how much latency is saved when TensorRT fuses the ONNX graph from 236 nodes to 42 nodes, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1063", "title": "Kernel Fusion for BERT-Tiny on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 6-layer BERT-tiny at batch 4 and sequence 128 on Jetson Orin, how do unfused and fused kernels differ in bandwidth needs and speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1064", "title": "Kernel Fusion: Specification — Define Fusion Requirements for Safety-Critical Edge Vision", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", 
"phase": "inference", "question": "What kernel fusion specification would you require for the forklift pedestrian detector to meet 60 fps under 50W with safety validation?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1065", "title": "Latency Decomposition: Compare Jetson Orin vs. Hailo-8 for Object Detection Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the realistic Platform A inference time for a 6.5 GFLOP INT8 YOLOv8-small model, and why is an estimate of 59ms mathematically wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1066", "title": "Latency Decomposition: Compare TFLite vs. TensorRT Inference Latency on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For MobileNetV3-Large on Jetson Orin, how do TFLite GPU delegate and TensorRT FP16 compare in end-to-end inference latency and overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1067", "title": "Latency Decomposition: Compute Prefill Latency for Edge LLM on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical and 60%-utilization prefill latency should you estimate for a 1B LLM processing a 128-token prompt in INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1068", "title": "Latency Decomposition: Full Pipeline Latency Audit for Autonomous Drone on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Assuming the accelerator provides 110 effective TOPS, can the sequential pipeline meet the 30ms budget, what is the bottleneck, and what fix is needed?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1069", "title": "Latency Decomposition: Diagnose and Fix Jetson Orin Inference Latency Spike", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What likely causes 8ms-to-45ms latency spikes on the 2 GOP INT8 safety classifier, and what quantified fix would you apply?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "edge-1070", "title": "Latency Decomposition: Size E2E Latency for Smart Camera on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the end-to-end latency budget for the 1080p Jetson Orin smart camera pipeline, and can it process every frame at 30fps?", "chain_ids": ["edge-chain-auto-secondary-001-05"], "chain_positions": {"edge-chain-auto-secondary-001-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1071", "title": "MLOps Lifecycle: Compare OTA Update Strategies for Fleet of Jetson Orin Edge Devices", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For 5000 Jetson Orins, how do 500MB full-model OTA updates compare with 50MB delta updates in bandwidth cost, update time, and fleet-wide failure risk?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 1}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1072", "title": "MLOps Lifecycle: Compare A/B Testing at Edge vs. Cloud for Model Updates", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which A/B testing strategy (edge-side deployment or cloud-side streaming) is more viable for 1000 Jetson Orin devices, considering bandwidth and operational costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1073", "title": "Edge Fleet Rollout, Rollback, and Bandwidth Sizing", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the MLOps lifecycle for 10,000 edge units, and quantify daily update bandwidth, per-vehicle storage, and monitoring volume?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 3}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1074", "title": "MLOps Lifecycle: Diagnose Edge Fleet Model Drift and Quantify Retraining Trigger", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What likely caused the customer-count mAP drop from 0.87 to 0.79, and what quantitative monitoring and retraining trigger would catch it earlier?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1075", "title": "MLOps Lifecycle: Size Storage and Bandwidth for Edge Model Registry", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": 
"edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the registry storage, per-device storage, update bandwidth, and LTE transfer time for 2000 Jetson Orins with 150MB models?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 0}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1076", "title": "MLOps Lifecycle: Size Weekly Hailo-8 Updates Across Registry, eMMC, Bandwidth, and Endurance", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size cloud storage, per-device eMMC impact, monthly bandwidth, and eMMC endurance for 500 devices with weekly 80MB updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1077", "title": "MLOps Lifecycle: Specify Monitoring and Alerting System for Edge CV Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What metrics, sampling rates, alert thresholds, and data volumes would you use to monitor 1000 Jetson Orin vision devices at remote sites?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1078", "title": "Model Format Conversion: Compare ONNX vs. TensorRT for Jetson Orin Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do ONNX-to-TensorRT INT8 and TorchScript direct inference compare for ResNet-50 on Jetson Orin in latency, complexity, and portability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1079", "title": "Model Format Conversion: Compare TFLite vs. 
ONNX Runtime for ARM Cortex on Jetson", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For MobileNetV2 on Jetson Orin Cortex-A78AE CPUs, how do TFLite XNNPACK and ONNX Runtime ACL compare for FP16 latency, memory, and op coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1080", "title": "Model Format Conversion: Convert a PyTorch Detector to a Static INT8 Dataflow Engine", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What deployment conversion workflow would you use to export the detector, handle unsupported operators, measure INT8 accuracy loss, and estimate compiled engine size and theoretical throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1081", "title": "Model Format Conversion: Diagnose TensorRT Conversion Failure for Custom Attention Ops", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What causes the TensorRT INT8 conversion failure on ScaledDotProductAttention for ViT-Base, how would you fix it, and what speedup should result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1082", "title": "Model Format Conversion: Size TensorRT Engine Storage for Multi-Model Jetson Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the approximate TensorRT engine-file sizes and NVMe file-transfer time for the three Jetson Orin models, and what extra information is required before claiming runtime memory fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1083", "title": "Model Format Conversion: Specify ONNX Conversion Pipeline with Validation for Edge", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What CI/CD stages and pass/fail criteria would you specify for this PyTorch-to-ONNX conversion pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1084", "title": "Model Size Estimation: Analyze Jetson Orin ResNet-50 INT8 Batch-1 Overhead", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT8 quantization of ResNet-50 on Jetson Orin yield only 1.3x speedup instead of 2x, using compute and bandwidth bottleneck analysis?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1085", "title": "Model Size Estimation: Design Memory Layout for Multi-Model Edge Vision System", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you lay out memory so all four Jetson Orin vision models stay resident and switch in under 5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1086", "title": "Model Size Estimation: Compare YOLOv8-l FP32 FP16 and INT8 TensorRT Memory on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much memory does YOLOv8-l use on Jetson Orin in FP32, FP16, and INT8 TensorRT, and how many concurrent instances fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1087", "title": "Model Size Estimation: Compare MobileNetV3-Large and EfficientNet-B3 INT8 Edge Memory", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do MobileNetV3-Large and EfficientNet-B3 compare in INT8 TensorRT memory, latency, and accuracy per MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1088", "title": "Model Size Estimation: Estimate LPDDR5 Memory for a YOLOv8-n INT8 TensorRT Pipeline", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much LPDDR5 memory does a YOLOv8-n model (3.2M params, INT8 TensorRT) need for this pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1089", "title": "Model Size Estimation: Master Full Memory Audit for Edge LLM Deployment on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the INT8 KV-cache memory per token for the 1B LLM with 32 layers, 32 KV heads, and head_dim 64?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1090", "title": "Model Size Estimation: Master Memory Tradeoff for Precision vs. 
Context Length on Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the FP16 KV-cache memory per token for a 1B LLM with 32 layers, 32 KV heads, and head_dim 32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1091", "title": "Model Size Estimation: Diagnose Memory Pressure Causing Swapping on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the Jetson Orin with 28GB FP16 weights on 32GB LPDDR5 start swapping and slow down 50%, and what quantified fix eliminates it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1092", "title": "Model Size Estimation: Diagnose GPU Memory OOM on Jetson Orin for Batch Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is causing the OOM when moving from batch 4 to 8, and what maximum batch size should fit on the Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1093", "title": "Model Size Estimation: Sizing LPDDR5 Memory for a 4K Multi-Model CV Pipeline on Jetson Orin", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What LPDDR5 memory budget should you allocate for this 4K camera, 3-model, H.265 pipeline on the 32GB Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1094", "title": "Estimate Host RAM, SRAM Fit, and PCIe Model-Switch Time for Sequential Edge Inference", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much host RAM is needed, do the three models fit in SRAM, and what is the model-switching time via 5GB/s PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1095", "title": "Maximum Safe INT4 Edge LLM Configuration for 16GB LPDDR5", "topic": "model-size-estimation", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What LLM family, precision, parameter count, KV-cache budget, and Time Per Output Token (TPOT) would you deploy within the 16GB constraints?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1096", "title": "MLOps Lifecycle: 
Optimize OTA Update Pipeline for Hailo-8 Fleet", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How can the 10,000-device Hailo-8 OTA update be reduced from 100 hours to under 1 hour without increasing CDN cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1097", "title": "Size Hailo HEF Storage and SRAM for Dual-Model Edge Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HEF storage and SRAM are required, do both Hailo-8 models fit simultaneously, and what switching overhead remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1098", "title": "Edge Queueing Theory Recall: Little's Law at the Network Edge", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using Little's Law, what is the mean latency per frame, and what does it imply for real-time video at 30 FPS?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 0}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1099", "title": "Edge Queueing Theory Analyze: Why Jetson Orin Pipeline Stalls", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using queueing theory, why can the 4-camera pipeline have 800ms latency at only 40% GPU utilization, and what stage is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1100", "title": "Edge Queueing Theory Analyze: Tail Latency on Battery-Constrained Jetson", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the M/M/1 utilization and P99 latency at 60W and 25W, and what happens if throttling drops service below arrivals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1101", "title": "Edge Queueing Theory Design: Multi-Model Pipeline on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using the correct M/D/1 formula, what is the queue wait for the 8 ms detection stage at 30 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1102", "title": "Edge Queueing Theory Design: Burst Traffic Handling with Jetson Buffer", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", 
"zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How large should the burst buffer be, what overflow policy should it use, and what maximum queue latency occurs during the 80 events/s bursts?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 2}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1103", "title": "Edge Queueing Theory Design: Hailo-8 vs Jetson Orin Queueing Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For λ=50 detections/s and a 5 GOPS model, what are throughput, utilization, queue wait, and power efficiency on Hailo-8 versus Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1104", "title": "Edge Queueing Theory Diagnosis: Jetson Orin Pipeline Latency Spike", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What root cause explains the 500 ms obstacle-detection spikes despite 35% GPU utilization, and how much latency can the fix remove?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1105", "title": "Edge Queueing Theory Diagnosis: Hailo-8 Throughput Under Real Traffic", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Hailo-8 retail pipeline deliver only 8 FPS on a 30 FPS camera, and how large is the gap to expected MobileNetV2 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1106", "title": "Edge Queueing Theory Diagnosis: Queue Overflow in Edge Gateway", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What explains the observed 25% drop rate in this overloaded edge gateway, and how should the system be resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1107", "title": "Edge Queueing Theory Evaluation: Jetson Orin vs Hailo-8+Pi Latency at Low Load", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For option B at λ=20/s and μ=40/s, what is the correct M/D/1 queue wait term before comparing latency and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1108", "title": "Edge Queueing Theory Evaluation: Coral Edge TPU vs Hailo-8 for TinyML Inference", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", 
"zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the Coral Edge TPU at λ=1000/s and μ=1333/s, what is the correct M/D/1 queue wait Wq before comparing to Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1109", "title": "Edge Queueing Theory Evaluation: Static vs Dynamic Batching on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 40 req/s, should the Jetson Orin use static batch=8 with a 200 ms wait or dynamic batching with a 20 ms timeout?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 3}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1110", "title": "Edge Queueing Theory Fluency: Little's Law Mental Math for Edge Systems", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the mean queue-plus-service latency, accelerator queue length and mean wait, and maximum service time for the three quick edge queueing estimates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1111", "title": "Edge Queueing Theory Fluency: Back-of-Envelope Edge Queue Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the Jetson Orin handle 4x30 FPS ResNet-50 streams, and what are its capacity, utilization, queue wait, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1112", "title": "Hailo-8 M/M/1 Latency at 70, 90, and 99 Percent Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are Wq, W, and P99 for Hailo-8 vehicle detection at 70%, 90%, and 99% utilization, and where is the latency knee?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1113", "title": "Edge Queueing Theory Implement: Service Rate from Latency Spec", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the face-recognition pipeline runs at mu=200/s and rho=0.5, what M/D/1 Wq and total latency verify the P99 < 50ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1114", "title": "Edge Queueing Theory Implement: Little's Law for Edge Buffer Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": 
"published", "phase": "inference", "question": "Is the gateway stable, and what buffer size in frames and MB satisfies Little's Law for the 100ms latency maximum?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 1}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1115", "title": "Hailo-8 Per-Chip Arrival Rate and Utilization for Corridor Safety Cameras", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "For the Hailo-8 option, using 2 cameras per chip, 30 FPS per camera, 26 TOPS INT8 capacity, and a 3 GOPS/frame YOLO-pose model, what are the per-chip arrival rate and utilization?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 4}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1116", "title": "Jetson Orin Priority Queueing for Mixed Heavy and Light Robot Inference", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How many Jetson Orins are needed for the mixed heavy/light workload, can priority queues meet the 500 ms P99 SLO, and does power fit the 200W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1117", "title": "Edge Queueing Theory Optimization: Reduce Jetson Orin P99 by Model Cascading", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is the two-stage small/large model cascade feasible for P99 latency, and how much Stage 2 capacity is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1118", "title": "Edge Queueing Theory Optimization: Hailo-8 Throughput via Request Coalescing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "If adjacent frames are coalesced in pairs, what are the new utilization and P99 latency, and does it meet the 200ms SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1119", "title": "Edge Queueing Theory Optimization: Memory-Aware Scheduling on Jetson Orin", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a decode queue with lambda=8/s and mu=9.7/s, what is the correct wait time and why does caching matter?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1120", "title": "Edge Queueing Theory Realization: Concrete Queue Depth for Coral Edge TPU", 
"topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many input requests and bytes must be buffered to meet the 500ms P99 SLO, and can the 512KB Cortex-M7 SRAM hold them?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 2}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1121", "title": "Jetson Orin Five-Stage Pipeline Mean Latency", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the mean G/D/1 latency for the pipeline, and which stage should be optimized first?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1122", "title": "Edge Queueing Theory Realization: Jetson Orin Memory vs Compute Bottleneck", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Concretely determine whether a Jetson Orin serving EfficientNet-B4 at batch=1 is compute-bound or memory-bandwidth-bound at 100 req/s, and what are the resulting M/D/1 metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1123", "title": "Robotic Arm Real-Time Inference Hardware and Scheduling Policy", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Should the robotic arm use Jetson Orin or Hailo-8 for the 100Hz vision task, and what queueing and scheduling policy guarantees the 5ms inference deadline under the stated WCET assumptions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1124", "title": "Smart Meter Anomaly Detector Queue Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What buffer size, scheduling policy, latency SLO, and power budget should the Hailo-8 smart-meter anomaly detector use?", "chain_ids": ["edge-chain-auto-018-05"], "chain_positions": {"edge-chain-auto-018-05": 3}, "chain_tiers": {"edge-chain-auto-018-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1125", "title": "Edge Queueing Theory Specification: Multi-Tenant Edge Inference SLO", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What priority levels and queueing policy should the shared Jetson Orin use for tenants A, B, and C, and do their P99 SLOs hold?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 3}, "chain_tiers": 
{"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1126", "title": "Edge Systolic Array Analyze: Why Hailo-8 Dataflow Wins for Computer Vision", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Hailo-8 architecture achieve significantly better energy efficiency than the GPU for this workload, and how much DRAM traffic is avoided?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1127", "title": "ResNet-50 Roofline Throughput for Batch 1 and Batch 64 on Jetson Orin", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is batch=1 ResNet-50 on Jetson Orin memory-bound, what throughput does Roofline predict, and how does batch=64 change it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1128", "title": "YOLOv8-Nano First Convolution Tiling and Latency on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should the first YOLOv8-nano convolution be tiled for Jetson Orin DLA SRAM, and what latency should the two DLA cores achieve?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 3}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1129", "title": "Edge Systolic Array Design: Hailo-8 Dataflow for Video Analytics", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What TDM schedule should the accelerator use for the three vision models, and what latency does each model see?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1130", "title": "Output-Stationary MobileNetV2 Depthwise vs Pointwise Convolution on Jetson DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should an output-stationary dataflow run the MobileNetV2 depthwise convolution, and how does it compare with 1x1 pointwise convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1131", "title": "Edge Systolic Array Evaluation: Hailo-8 vs Coral TPU Dataflow Efficiency", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which chip achieves better Model FLOPs Utilization (MFU) deploying EfficientNet-Lite0 at batch size 1, and why?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1132", "title": "EfficientNet-B0 32x112x112 Depthwise Convolution WS vs RS SRAM Access", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 32-channel 112x112 3x3 depthwise conv, how do WS and RS dataflows compare in SRAM access and energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1133", "title": "Weight-Stationary vs Output-Stationary QK Attention", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For QK^T with seq=64 and head_dim=64, is output-stationary or weight-stationary dataflow better, and what are the memory access counts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1134", "title": "Edge Systolic Array Fluency: Arithmetic Intensity Estimation for Edge Models", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Without looking up formulas, estimate the arithmetic intensities for MobileNetV2, the 3x3 conv, and the 1x1 conv on Hailo-8, and state whether they are memory or compute bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1135", "title": "Edge Systolic Array Fluency: Roofline Model Mental Math for Jetson", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the three Jetson Orin operations, are they memory or compute bound and what limiting throughput or latency do you estimate?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 1}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1136", "title": "YOLOv5s FPS and Energy on Hailo-8, Jetson Orin, and Coral TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What FPS and energy per frame can YOLOv5s achieve on Hailo-8, Jetson Orin, and Coral, and which bottleneck limits each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1137", "title": "MobileNetV3 28x28 DW+PW Memory and Latency Bottleneck on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the MobileNetV3 28x28 DW+PW block on Hailo-8, what are the memory accesses, latency, throughput, and bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "edge-1138", "title": "Edge Systolic Array Mastery: Full EfficientNet Roofline on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For EfficientNet-B0 on Orin DLA, which layers are memory or compute bound, what MFU do you expect, and what should run on GPU vs DLA?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 4}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1139", "title": "ViT-Tiny on Jetson Orin GPU vs DLA Placement, Latency, and Energy", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "For ViT-Tiny on Jetson Orin, should attention, FFN, and patch embedding run on GPU or DLA, and what latency and energy meet 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1140", "title": "Edge Systolic Array Optimization: Fix Memory-Bound Conv on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What bottleneck explains the CNN's 11.5% MFU, and how do channel grouping and larger spatial resolution change its intensity and speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1141", "title": "Edge Systolic Array Optimization: DLA vs GPU Load Balancing on Jetson", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can MobileNetV2 move from GPU to DLA while maintaining 2000 FPS, and what power savings and maximum DLA FPS result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1142", "title": "Hailo-8 INT8 ResNet-18 Memory Footprint and Latency", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For batch-1 inference, using weights plus peak activation memory only, what are the FPS, latency, and memory footprint reduction for Hailo-8 INT8 versus the theoretical FP16 comparison?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1143", "title": "EfficientDet-Lite2 Batch Throughput on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For EfficientDet-Lite2 on Orin DLA at batch sizes 1, 4, and 16, what FPS, latency, energy, and bound regime do you get?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1144", "title": "Edge Systolic Array Recall: Systolic Array Operation Principle", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the key systolic-array principle, and why does the Hailo-8 deliver significantly better TOPS/W than the ARM Cortex-A78 for INT8 matmul?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1145", "title": "Orin DLA SRAM Tiling for 3x3 INT8 Convolution", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What safe 16-aligned output tile fits this 3x3 conv in Orin DLA SRAM, how should output channels be split, and what are the per-core byte counts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1146", "title": "Edge TCO Analyze: Total Cost of Deployment at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the 5-year TCO of 500 edge AI units compared to a cloud-based alternative at $0.50/hr?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1147", "title": "Dataflow Accelerator vs Edge SoC Cost Per Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which platform provides a cheaper per-inference cost when including hardware amortization and power consumption over 3 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1148", "title": "Edge TCO Analyze: Power Budget Dominates Edge IoT Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For 10,000 Coral Edge TPU units over 5 years, what percentage of TCO comes from hardware, power, connectivity, maintenance, and software?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1149", "title": "Edge TCO Design: Optimize Jetson Orin Deployment for 3-Year ROI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct annual power cost for one 60W Jetson Orin running 24/7 at $0.10/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1150", "title": "Edge TCO Design: Hailo-8 ROI for Smart Agriculture", "topic": 
"tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the break-even point and 5-year ROI for deploying Hailo-8 edge inference on 1000 farms versus cloud inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1151", "title": "Edge TCO Design: Size Fleet Budget for Autonomous Vehicle Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the 5-year total cost of ownership (TCO) for the inference compute hardware across the 10,000-vehicle fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1152", "title": "Edge TCO Diagnosis: Why Edge Deployment Is Losing Money", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 100-unit smart parking deployment losing money, and what monthly revenue per unit is needed to break even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1153", "title": "Edge TCO Diagnosis: Hailo-8 Deployment Failure Root Cause", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hidden costs explain the Hailo-8 fleet overrun, and what is the true 1-year TCO versus the expected TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1154", "title": "Edge TCO Diagnosis: Connectivity Cost Surprise in IoT Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 4G overage, how much does it cost annually, and how should the 2,000-unit Jetson fleet fix it?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1155", "title": "Edge TCO Evaluation: Coral TPU vs Cloud Inference for Low-Volume Deployments", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the beehive monitor at 1 inference per minute, what is the 3-year TCO for edge versus Lambda, and what is the daily break-even volume?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1156", "title": "Edge TCO Evaluation: 3 Edge Hardware Options for Retail Analytics", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the correct 3-year power costs for the 7.5W Hailo-8+Pi5 and 4W NCS2+Pi4 options at $0.10/kWh?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1157", "title": "Edge TCO Evaluation: Batch vs Real-Time Inference Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the 5-year TCO premium for real-time Jetson Orin inference versus overnight Hailo-8+Pi5 batch inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1158", "title": "Jetson vs Coral 3-Year TCO Break-Even", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year TCOs for the Jetson Orin and Coral Edge TPU fleets, and at what daily inference volume does the Coral fleet beat the cloud?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1159", "title": "Edge TCO Fluency: Rapid Cost-Per-Inference Estimation", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the 3-year total cost and cost-per-inference for these systems, and how do they rank by cost-efficiency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1160", "title": "Edge AI Camera 3-Year TCO With Power and Maintenance", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the corrected 3-year TCO for one edge system after including power and 5% annual maintenance?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1161", "title": "Edge TCO Implement: Calculate Cost Per Inference for Edge Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the checkout device running 5 inferences/s for 12hr/day, what is the 3-year TCO and cost per inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1162", "title": "Edge TCO Implement: Fleet Power Cost Optimization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 1000-unit Hailo-8+Pi5 fleet, what 
are the annual power cost at $0.10/kWh, the duty-cycling savings if off-hours sleep power is 1W per unit for 12 hours/day, and the 5-year NPV at a 5% discount rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1163", "title": "Edge TCO Implement: Hailo-8 vs Cloud Break-Even Analysis", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact daily inference volume break-even point between the Hailo-8+Pi5 and AWS Rekognition over 3 years?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1164", "title": "Hospital Hailo-8 Fleet 5-Year Power Cost", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct 5-year power cost for 50 Hailo-8 edge devices operating at 7.5W continuously, assuming $0.10/kWh?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 4}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1165", "title": "Edge TCO Mastery: Full Lifecycle Cost Model for Industrial Edge AI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the 5-year TCO, ROI, and payback period for the 200-unit predictive maintenance deployment?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1166", "title": "Edge TCO Mastery: Hardware Replacement vs Cloud Migration Decision", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct 4-year power cost for 300 replacement Orin units at 60W and $0.10/kWh?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1167", "title": "Edge TCO Optimization: Power Management for Always-On Edge Systems", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the 5-year power cost savings from DVFS reducing to 15W at idle, and their NPV at a 5% discount rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1168", "title": "Edge TCO Optimization: Model Compression for Edge Cost Reduction",
"topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the correct 5-year power cost for 200 Jetson Orin units at 60W and $0.10/kWh before compression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1169", "title": "Edge TCO Optimization: Optimize Connectivity Costs Dominating TCO", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you cut the 1000-unit fleet's 4G connectivity cost by at least 90%, and what are the 5-year savings and payback?", "chain_ids": ["edge-chain-auto-secondary-001-10"], "chain_positions": {"edge-chain-auto-secondary-001-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1170", "title": "Edge TCO Optimization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What power-management plan will cut the 500-store Hailo-8 fleet's annual power cost by 60%, and what savings result?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1171", "title": "Edge TCO Realization: Concrete 1-Year Cost Breakdown for Edge AI System", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the 1-year cost breakdown across amortized hardware, power, maintenance, and connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1172", "title": "Edge TCO Realization: Scale-Up vs Scale-Out for Edge Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you size the Jetson Orin and Hailo-8 options for 500 TOPS INT8, compute 3-year TCO, and compare failure modes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1173", "title": "Solar Sizing and TCO for a Remote Edge Weather Station", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What panel wattage, LiFePO4 battery capacity, and 5-year TCO would you size for the 24/7 remote station versus grid power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1175", "title": "Edge TCO Recall: CapEx vs OpEx Tradeoff for Edge AI", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", 
"question": "For 100 Jetson Orin units, what are first-year CapEx, annual power OpEx, and why might a cash-constrained startup still prefer cloud OpEx?", "chain_ids": ["edge-chain-auto-secondary-001-11"], "chain_positions": {"edge-chain-auto-secondary-001-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1176", "title": "Edge TCO Specification: Design $50K Budget Deployment for Smart Building", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What complete edge AI deployment would meet 50 cameras at 30 FPS with P99 <100ms under the $50K 3-year budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1177", "title": "Edge TCO Specification: Design Lifecycle Cost Model for 5-Year Edge Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the annual fleet power cost for 100 edge devices at 60W and $0.10/kWh?", "chain_ids": ["edge-chain-auto-secondary-001-12"], "chain_positions": {"edge-chain-auto-secondary-001-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1179", "title": "Edge Transformer Cost Evaluation: Quantized vs Full Transformer on Jetson", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which is better for a real-time document triage system requiring P99 < 50ms, considering throughput, memory footprint, and latency at batch=1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1180", "title": "Edge Transformer Cost Evaluation: Distilled vs Full Transformer on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which should run on Hailo-8 for batch=1 sentiment, DistilBERT or BERT-base, and what throughput and energy-cost tradeoffs result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1181", "title": "TinyBERT Inference Bottleneck on Jetson Orin DLA", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the stated estimates, is TinyBERT on Jetson Orin DLA compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1182", "title": "Jetson Orin Phi-3-Mini-4K Concurrency and Latency Analysis", "topic": "transformer-systems-cost", "competency_area": "compute", "track": 
"edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can one Jetson Orin handle Phi-3-Mini-4K INT4 for 10 simultaneous users with <2s 200-token responses, and what is its annual cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1183", "title": "Edge Transformer Cost Mastery: Quantization Tradeoff for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which quantization levels are Pareto-optimal for LLaMA-3-8B on Jetson Orin under P99 <1s and <3% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1184", "title": "BERT-base Pruning on Jetson Orin: TCO and Throughput", "topic": "tco-cost-modeling", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is a $30K project to prune BERT-base from 110M to 55M parameters on 100 Jetson Orins justified, and what gains result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1185", "title": "Edge Transformer Cost Optimization: Knowledge Distillation for Edge Deployment", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What throughput, memory, accuracy, and annual savings would distilling BERT-large to TinyBERT deliver across 50 Jetson Orins?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1186", "title": "Phi-3-Mini INT8 vs INT4 Decode SLO on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is the 2-second P99 SLO met for 100 output tokens using either INT8 or INT4 quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1187", "title": "Edge Transformer Cost Realization: FLOPs per Token for Small Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using the stated Phi-3-Mini layer shape, what are decode FLOPs per token versus 2N, and is Orin compute- or bandwidth-bound?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1188", "title": "Jetson Orin KV Cache and INT4 Model Size Budget at 2.5 ms/token", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the correct KV cache size and max INT4 model size after reserving KV for 5 users at context 128?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1189", "title": "Achieve Mastery in Edge Queueing System Design Under Intermittent Connectivity", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you model buffering, overflow, and reconnection bursts for Hailo-8 drones disconnected for 20 minutes at 4 req/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1190", "title": "Diagnose Attention Kernel Underutilization on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the correct theoretical latency for 4.9M MACs on a 13 TOPS DLA, and why is the 25ms estimate completely wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1191", "title": "Diagnose Systolic Array Pipeline Stall on Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does rate-4 dilation drop the model's FPS so drastically, and how would you fix it?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 1}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1192", "title": "Realize Systolic Array Output-Stationary Tiling on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What output-stationary tiling strategy fits W[512x256] x X[256x128] on the accelerator, and what is the total SRAM traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1193", "title": "Recall Transformer Inference Latency on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the expected single-image inference latency for the ViT-B/16 model?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 0}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1194", "title": "Analyze Prefill vs Decode Cost Split on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do prefill and decode times split for a 7B INT4 model on Orin NX with a 512-token prompt and 128-token response, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1195", "title": "Design Speculative Decoding for Edge LLM on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What speculation window k should you choose for the 7B target and 1B draft at α=0.80, and what speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1196", "title": "Edge GPU Continuous Batching Queue for LLM Serving", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For an edge GPU continuous batching queue, should P99 be estimated from waiting time Wq or total sojourn time W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1197", "title": "Diagnose KV Cache Eviction Causing Latency Spikes on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does KV cache migration occur at only 8 concurrent requests on Orin, and what cache-management fix removes the 2.8s tail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1198", "title": "Diagnose Decode Latency Regression from KV Cache Format on Hailo-8", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Hailo-8 transformer slow from 8ms/token to 22ms/token after step 32, and what KV layout fixes it?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 1}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1199", "title": "Edge Transformer Fluency: Memory-Bandwidth-Bound Decode Calculation", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the KV cache size at context=256 for 32 layers, 32 heads, head_dim=128, and FP16, and why isn't it 1.07GB?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 0}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1200", "title": "Edge Transformer Fluency: Prefill FLOPs on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is the attention cost per layer about 8.59 GFLOPs rather than TFLOPs for this prefill calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-1201", "title": "Optimize KV Cache Quantization for Jetson Orin Memory Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Do any listed KV quantization choices fit 5 concurrent 16K-context requests in the 4.5GB KV budget?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 2}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1204", "title": "Making a 3B Model with 16K Context Fit on an Edge SoC", "topic": "flash-attention", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you make a 3B model with 16K context feasible on an edge SoC, including FlashAttention, KV quantization, and attention pattern?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1206", "title": "Speculative Decoding on Jetson Orin", "topic": "speculative-decoding", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which draft strategy gives the best expected speculative decoding throughput on Jetson Orin under the 102GB/s bandwidth limit?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1207", "title": "N-Gram Draft for Zero-Overhead Edge Speculation", "topic": "speculative-decoding", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you implement K=8 prompt-lookup drafting for a 4096-token prompt on Orin, and what acceptance rates and memory cost should you expect?", "chain_ids": ["edge-chain-auto-secondary-017-42"], "chain_positions": {"edge-chain-auto-secondary-017-42": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-42": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1208", "title": "Edge TPU Operator Fallback Penalty", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU inference take 45ms instead of 5ms, and what is the exact USB I/O penalty for the 5MB tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1209", "title": "Continuous Learning Activation Memory Tradeoff", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the 7B fine-tune on the edge accelerator OOM despite 7GB weights, 
and how does gradient checkpointing trade compute for memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1210", "title": "Adversarial Vulnerability from Edge Quantization", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did INT8 quantization make the Jetson Orin model vulnerable to physical adversarial patches, and why was higher precision infeasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1211", "title": "Latency Spikes in Dynamic Batching on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do p99 latency spikes occur at 60 FPS with a 15ms dynamic batching timeout on Jetson Orin despite compute headroom?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1212", "title": "Latency Spikes in Cascaded Edge TPU Pipelines", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Stage 2 trigger a 20ms latency spike on Coral Edge TPU despite the pipeline needing only 0.1 TOPS of compute?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1213", "title": "Analyzing Memory-Bound INT8 Throughput on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does throughput flatline around 82 TOPS instead of reaching the Jetson Orin's 275 TOPS INT8 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1214", "title": "Calibration Data Pruning for Edge TPU Quantization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did using 10,000 random calibration images improve Coral Edge TPU INT8 accuracy versus the full 1M-image dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1215", "title": "Analyzing Hailo-8 Host Memory Streaming Bottlenecks", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Hailo-8 
starving at 15% utilization with the 1080p pipeline capped at 60 FPS, and where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1216", "title": "INT8 Quantization Impact on Edge Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Given the Coral's INT8 requirement, why did the data validation accuracy degrade after the 4x sensor range increase, and what is the compute utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1217", "title": "Bandwidth Bottlenecks in Edge Data Curation", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the Hailo-8 active learning pipeline miss small distant objects after downsampling 4K video to 720p?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1218", "title": "Drift Detection Bottleneck on Dataflow Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does extracting an intermediate feature map metric cause such a severe latency degradation on a dataflow architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1219", "title": "Hailo-8 Host-Device DMA Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory-management issue causes high CPU use and caps the Hailo-8 pipeline at 166 FPS despite low accelerator utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1220", "title": "Encoder vs Decoder Utilization on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the decoder generation arithmetic intensity on the Jetson Orin, and why does it bottleneck while the encoder does not?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1221", "title": "Thermal Throttling in Memory-Bound Attention Layers", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do the self-attention layers hit the 60W TDP and throttle despite doing fewer INT8 MACs than the MLP layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1222", "title": "GPTQ 3-bit Latency Degradation on Ampere", "topic": "extreme-quantization", 
"competency_area": "precision", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 3-bit GPTQ make the 70B model fit on Jetson Orin but still degrade token-generation performance?", "chain_ids": ["edge-chain-auto-secondary-006-01"], "chain_positions": {"edge-chain-auto-secondary-006-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1223", "title": "INT8 Quantization Bias on Cloud AI 100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does global uniform INT8 quantization on the Cloud AI 100 raise Group B's false positive rate while Group A is unaffected?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1224", "title": "Federated Averaging Bottleneck on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Explain why the nodes are idling and why timeouts occur despite the massive compute capacity of the Jetson Orin?", "chain_ids": ["edge-chain-auto-017-01"], "chain_positions": {"edge-chain-auto-017-01": 0}, "chain_tiers": {"edge-chain-auto-017-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1225", "title": "Analyze INT8 Underutilization on Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Cloud AI 100 achieve only 50 TOPS on this INT8 kernel while its LPDDR4x interface is saturated at 100 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1226", "title": "Thermal Throttling and Task Shedding on Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why should the drone completely shed the auxiliary model instead of proportionally downscaling frequencies for both workloads under the 45W cap?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1227", "title": "Analyzing Fusion Memory Spills on Qualcomm Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does fusing the three 100-GOPS blocks into one 300-GOPS 
kernel make inference slower on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1228", "title": "Analyzing Memory Bottlenecks in Unfused Operations on Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do separate LayerNorm, add, and GELU CUDA kernels underutilize the Jetson Orin GPU, and how does fusion fix it?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 0}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1229", "title": "Distillation versus Unstructured Pruning on Dataflow Accelerators", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning not speed up the Hailo-8 model while a 50-GOPS dense distilled model doubles FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1230", "title": "Latency Bottleneck on Qualcomm AI 100", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the batch-8 pipeline take 265ms end to end when accelerator inference is only 15ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1231", "title": "Dataflow Accelerator Round-Robin Degradation", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does dynamic round-robin routing cause this performance degradation on this specific hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1232", "title": "Hailo-8 Host Streaming Bandwidth Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 pipeline achieve only 16 FPS despite 26 TOPS of compute for a 200-GOPS-per-frame model?", "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 2}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1233", "title": "Mmap RAM Tradeoffs for Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mmap reduce host RAM from 100MB to 20MB for five Coral Edge TPU processes, and 
how do the INT8 weights still reach the TPU over the USB interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1234", "title": "Jetson Orin OOM During Batch Fine-Tuning", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does batch size 16 trigger OOM on the 32 GB Jetson Orin while 16-step gradient accumulation with micro-batch 1 fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1235", "title": "Hailo-8 INT8 Streaming Throughput and Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Assuming 8MB of INT8 activations per frame, what link-limited FPS ceiling does host pre-quantization create, and why can accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1236", "title": "CI/CD Latency Skew on Hailo-8 Dataflow Accelerator", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the TOPS-based CI/CD estimate miss the Hailo-8 canary latency spike and host memory saturation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1237", "title": "TensorRT DLA Fallback Overhead on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does TensorRT INT8 partial DLA delegation increase Jetson Orin latency from 20ms to 25ms versus all-FP16 GPU execution?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1238", "title": "Edge Compute vs Thermal Throttling on Orin", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Orin throttle at 400 FPS even though 220 TOPS is below the advertised 275 TOPS INT8 peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1239", "title": "Edge TPU Inference Bottleneck Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the USB 2.0 accelerator show low utilization despite a fully INT8 10-million-parameter model and high peak compute capability?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1240", "title": "Thermal Throttling on Qualcomm Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does P99 latency rise by 40% after 45 minutes at 75W in the restricted-airflow edge cabinet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1241", "title": "Hailo-8 Dataflow Topology Memory Bottleneck", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the lower-compute Model X run at 150 FPS while the higher-compute Model Y reaches 600 FPS on the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1242", "title": "Edge TPU Subgraph Partitioning and USB I/O Overhead", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does one unsupported Swish activation add 12ms of latency when the host CPU computes it in only 2ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1243", "title": "Analyzing Power Spikes on Google Coral Edge TPU", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU spike to 2W at 10 FPS, and what are the energy per inference and average power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1244", "title": "Hailo-8 Host-to-Device Streaming Latency Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is Hailo-8 inference latency 6.5 ms with only about 10% compute utilization for a 20-GOPS, 12MB-per-frame model?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1245", "title": "Unstructured Sparsity Latency Stagnation", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 75% unstructured pruning reduce model storage but produce zero latency speedup on the Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1246", "title": "Asymmetric vs Symmetric Quantization Overhead", "topic": "quantization-fundamentals", 
"competency_area": "precision", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does switching from symmetric to asymmetric INT8 quantization slow dense layers on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 1}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1247", "title": "Hardware-Constrained Safety Guardrail Degradation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU silently stop logging the 30-GOPS guardrail at 60 FPS during peak activity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1248", "title": "Jetson Orin Roofline Bottleneck Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, explain why the accelerator achieves only about 40.8 TOPS for batch-1 ViT inference despite a 275 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1249", "title": "Unified Memory Bandwidth Starvation", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do four 8K60 streams drop frames on Jetson Orin despite low compute use, and how much LPDDR5 bandwidth do the explicit copies consume?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1250", "title": "Hailo-8 Data Streaming Bottleneck Analysis", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What TOPS does the Hailo-8 achieve on this layer, and why is it underutilized despite a 26 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1251", "title": "Edge TPU Thermal Throttling Analysis", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral USB accelerator drop from 30 FPS to 15 FPS, and what sustained power avoids throttling at 40°C ambient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1252", "title": "Edge TPU Sequence Scaling Bottleneck Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": 
"apply", "status": "published", "phase": "both", "question": "Why does increasing ViT sequence length from 196 to 784 tokens make Coral Edge TPU latency exceed 240ms and trigger host thrashing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1253", "title": "KV-Cache Budgeting on Cloud AI 100", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the batch-32, 2048-token INT8 13B LLM workload exceed the 32 GB memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1254", "title": "Shadow Deployment on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you shadow-deploy the 150 TOPS model without causing thermal throttling or OOMs in the 80 TOPS safety path?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1255", "title": "On-Device Fine-Tuning Checkpoint Architecture for Jetson Orin", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a gradient checkpointing strategy to fine-tune the ViT within the 32 GB memory and 60W TDP constraints?", "chain_ids": ["edge-chain-auto-secondary-003-18"], "chain_positions": {"edge-chain-auto-secondary-003-18": 3}, "chain_tiers": {"edge-chain-auto-secondary-003-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1256", "title": "Adversarial Defense Architecture on Hailo-8 Dataflow Accelerator", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which real-time adversarial-patch defense would you deploy on the streaming NPU, and how would you fit it into its host-streamed dataflow?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1257", "title": "Dataflow Batching Architecture for Multi-Camera Edge Streams", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you use static or dynamic batching for the four 1080p streams on Hailo-8, and what scheduling policy meets a 15ms p99 SLA?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1258", 
"title": "On-Premise Continuous Learning Data Selection", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design on-device pruning and coreset selection on Cloud AI 100 while avoiding uplink overload and synthetic-data collapse?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1259", "title": "Data Validation and Lineage on Qualcomm Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect on-premise data validation and lineage tracking on Cloud AI 100 without starving the defect detectors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1260", "title": "Active Learning Data Pipeline for Edge TPU Defect Detection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you select 1,000 weekly images for annotation from Coral Edge TPU devices given INT8-only, limited-operator hardware?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1261", "title": "Edge Drift Detection Architecture on Coral TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you detect data drift on Coral Edge TPUs under a 100 KB/s uplink and INT8-only operator constraints?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": {"edge-chain-auto-secondary-009-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1262", "title": "Architecting Zero-Copy DMA for Multi-Camera Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you move preprocessed INT8 tensors from host memory to the USB Coral Edge TPU with minimal latency and CPU overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1263", "title": "Architecting Sequence Models for Hailo-8 Dataflow", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which sequence architecture, encoder-only, decoder-only, or 
hybrid, best fits a host-streamed 26 TOPS INT8 accelerator at 2.5W?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1264", "title": "Architecting an Energy-Efficient Pipeline on Hailo-8", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the vision pipeline to minimize total energy per inference despite costly host DRAM streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1265", "title": "Architecting Extreme Quantization for Hailo-8 Streams", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can sub-4-bit weight storage relieve Hailo-8 host bandwidth, and what deployment constraint decides whether it helps?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1266", "title": "Architecting a Real-Time Vision Pipeline on Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you partition object detection and segmentation across Jetson Orin's Ampere GPU and DLA while managing 32 GB LPDDR5 bandwidth and the 60W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1267", "title": "Thermal and Bandwidth Graceful Degradation on Hailo-8", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What graceful degradation ladder would keep the Hailo-8 robot fail-operational when host thermal throttling cuts memory bandwidth, preventing the execution of its full suite of models at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1268", "title": "Architecting Edge TPU INT8 Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill the FP32 teacher into an Edge TPU student that runs entirely in INT8 without CPU fallback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}},
{"id": "edge-1269", "title": "Architecting a Vision-Language Pipeline on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you decompose the VLM pipeline so a 4B model meets the 150 ms sensor-to-control latency budget, and how do you map it to hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1270", "title": "PCIe Coral Edge TPU Cluster Routing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 200 FPS from 10 cameras across 4 PCIe Coral Edge TPUs while minimizing copies and respecting INT8-only execution?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1271", "title": "Architecting Multi-Tenant Memory-Mapped Inference for Edge", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you memory-map and share a dozen model weights to allow processes to switch lines without cold starts within 32 GB?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1272", "title": "Zero-DRAM Accelerator Host Memory Scheduling", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage host memory for the Hailo-8 cascaded 4-stream pipeline to avoid fragmentation, memory pressure, and OOM kills?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1273", "title": "Edge Video Analytics Pipeline Architecture", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a pipeline to convert an FP32 PyTorch model to run on the USB Coral Edge TPU while preserving small-object accuracy?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1274", "title": "Architecting an INT8 Dataflow Pipeline for Hailo-8", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you convert and partition the vision model for Hailo-8 so unsupported ops do not create costly host round trips?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1275", "title": "Architecting Multi-Model Inference on Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantize and place the ASR, LLM, and TTS models on one Cloud AI 100 so they fit in 32 GB and stay within 75W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1276", "title": "Operator Scheduling Architecture on Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you statically schedule concurrent models on Cloud AI 100 to reuse 32 GB memory, and what utilization do the given numbers support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1277", "title": "Coral Edge TPU Pipeline Bottleneck Analysis Design", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you profile the pipeline to locate whether latency comes from I/O, CPU pre/post-processing, or operator fallbacks?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1278", "title": "Architecting On-Premise Guardrails for PII Redaction", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect a fail-safe pipeline fitting PII, toxicity, and injection checks alongside the main generative model within the 32 GB memory constraint?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1279", "title": "Edge Video Storage Pipeline for NVIDIA Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you structure the on-device storage formats to ensure the 32 GB LPDDR5 memory is not bottlenecked while accommodating both the heavy video writes and metadata querying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1280", "title": "Zero-Copy Video Ingestion Pipeline for Hailo-8", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design host-side 
ingestion and memory management for 4x1080p at 30 FPS on a DRAM-less Hailo-8 to avoid bandwidth bottlenecks?", "chain_ids": ["edge-chain-auto-secondary-014-12"], "chain_positions": {"edge-chain-auto-secondary-014-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1281", "title": "Systolic Array Tiling for USB Edge Accelerators", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose weight- versus output-stationary tiling to keep the INT8 array busy without CPU fallbacks?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 2}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1282", "title": "Architecting Thermal Management for Unconditioned Edge Deployments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage scheduling and thermal headroom to meet burst-latency SLAs without throttling in a hot telco edge enclosure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1283", "title": "Architecting a Multi-Model Pipeline on Jetson Orin under VRAM Constraints", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate memory and execution on Jetson Orin so a 7B LLM and three vision models avoid OOMs under the 32 GB unified memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1284", "title": "Multi-Model Data Streaming Bottleneck on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the Hailo-8 pipeline jumping from two 2 ms models to over 18 ms end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1285", "title": "Diagnosing Power Throttling in Low-Compute Edge Models", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Cloud AI 100 hit its 75W power limit with only 15% INT8 compute utilization but nearly saturated memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1286", "title": "Diagnosing Edge TPU Compiler Graph Partitioning", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How do you identify the root cause of this compilation failure, and what constraints of the hardware architecture typically lead to this symptom?", "chain_ids": ["edge-chain-auto-secondary-006-28"], "chain_positions": {"edge-chain-auto-secondary-006-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1287", "title": "Hailo-8 Model Thrashing with mmap Weight Loading", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the mmap-based three-model pipeline stutter and drop below 1 TOPS even though each 15 MB model runs alone?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1288", "title": "Hailo-8 Host Bandwidth Exceeded Error Diagnosis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is host memory bandwidth exhausted and causing Hailo-8 timeouts even though the INT8 weights are only 15 MB?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1289", "title": "Diagnosing Latency Spikes on Edge TPUs", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of this degradation on the Google Coral Edge TPU, and how would you diagnose it?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1290", "title": "Diagnosing Guardrail Context-Switching", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the hardware-level root cause of this system degradation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1291", "title": "Diagnosing Low Utilization on Jetson Orin DLA", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the architecture of a systolic array accelerator like the DLA, what is the root cause of this poor utilization?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 
2}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1292", "title": "Diagnosing Sudden Inference Latency Spikes on Hailo-8 Under Load", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of the Hailo-8 throughput collapse in the 45°C enclosure?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1293", "title": "Canary Rollout Latency Spike on Edge TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this deployment failure?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1294", "title": "Adversarial Power Spikes on Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 70W power spikes and thermal throttling on identical inputs, and what are the security implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1295", "title": "Diagnosing Latency Spikes in Dynamic Batching on Edge Accelerators", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic batching with max batch 16 and 10ms timeout cause 30ms latency and only 300 req/sec at 600 req/sec traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1296", "title": "Diagnosing MobileNetV3 CPU Fallback", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural features of this specific CNN design are causing this severe performance bottleneck on the Coral accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1297", "title": "Diagnosing Cost Overruns in Edge LLM Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did the compute-only estimate fail, and what is the actual hardware bottleneck limiting performance to 
13.6 tokens/second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1298", "title": "Coreset Memory Bottleneck on Dataflow Edge", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the root cause of this underutilization?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1299", "title": "Host Data Validation Starving Hailo-8 Dataflow", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did adding host-side data validation drop throughput from 300 FPS to 45 FPS and Hailo-8 utilization to 15%?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1300", "title": "INT8 Calibration Dataset Bias at the Edge", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the INT8 Jetson Orin deployment miss pedestrians specifically at dusk and night despite strong FP32 performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1301", "title": "Latency Spikes During Edge Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FP32 KL/PSI drift monitor cause a 40% latency spike in the primary INT8 model on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1302", "title": "Diagnosing High Latency in Jetson Orin Unified Memory", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this bottleneck on this specific SoC architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1303", "title": "Diagnosing Power Throttling in Memory-Bound Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the Cloud AI 100 hitting its 75W TDP and throttling while INT8 compute utilization stays below 15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1304", "title": "INT8 Quantization Impact on Equalized Odds", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware-mandated transformation caused this localized degradation, and how do you diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1305", "title": "Diagnosing CPU Fallback on Edge TPU", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural issue causes 120ms latency and 100% host CPU utilization on the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1306", "title": "Debugging Fallback OOM on Cloud AI 100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure during the fallback transition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1307", "title": "Distilled INT8 Quantization Collapse", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What causes this severe accuracy degradation specific to the distilled model after INT8 quantization?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1308", "title": "Diagnosing OOM from Memory Fragmentation on AI 100", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of this failure?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1309", "title": "Diagnosing FP16 Overflow on Jetson Orin", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the FP16 TensorRT deployment drop from 85% to 55% mAP with collapsing bounding boxes?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": {"edge-chain-auto-secondary-011-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1310", "title": "Diagnosing Operator Fallback on Qualcomm Cloud AI 100", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the operator fallback on the Cloud AI 100, and how do you diagnose the specific coverage gaps?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1311", "title": "Diagnosing Intermittent Latency Spikes on Google Coral Edge TPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the root cause of these performance anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1312", "title": "Edge TPU Multi-Camera Batching Evaluation", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use static batching of 4 frames or sequential batch-1 processing to minimize latency while meeting 120 FPS?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1313", "title": "Evaluating MobileNetV2 vs ResNet on Hailo-8 Dataflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture do you choose for this specific hardware, and how do the platform's memory characteristics drive your decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1314", "title": "Heterogeneous Model Routing on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Between running both models sequentially on the Ampere GPU (Architecture A) or pinning the router to the DLA and the LLM to the GPU (Architecture B), which architecture is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1315", "title": "Coral Edge TPU Model Selection and Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model, Alpha or Beta, is the most cost-effective choice for 4 cameras at 15 FPS on a Coral Edge TPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1316", "title": "Evaluating Active Learning for 
Cloud AI 100", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy do you choose, and how do you justify the tradeoff between selection quality and operational feasibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1317", "title": "Evaluating Drift Detection Architectures for High-Throughput Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1318", "title": "Optimizing PCIe DMA for 4K Video on Cloud AI 100", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the Cloud AI 100 pipeline use pinned-memory async DMA or zero-copy PCIe access for 4K60 frames, and why?", "chain_ids": ["edge-chain-auto-secondary-008-18"], "chain_positions": {"edge-chain-auto-secondary-008-18": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1319", "title": "Encoder vs Decoder on Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the offline smart home hub use a MobileBERT-style encoder or a small quantized GPT-style decoder on the Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1320", "title": "Evaluating Operator Energy Tradeoffs on Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which model is more energy-efficient on the 2W Coral Edge TPU: fewer ops with 50M DRAM accesses or twice the ops fitting in SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1321", "title": "Evaluating INT8 Quantization Bias on Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you choose FP32 fairness plus overall INT8 accuracy (Pipeline A), or evaluate equalized odds on the Hailo-8 hardware (Pipeline B)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1322", "title": "Evaluating Edge Accelerators vs Traditional GPUs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", 
"phase": "both", "question": "Which architecture minimizes system-level latency and power for this pipeline, and how does the Hailo-8's lack of local DRAM alter your model optimization strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1323", "title": "Evaluating Degradation Strategies on Coral Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach (Strategy A vs B) provides better fail-operational reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1324", "title": "Knowledge Distillation vs Pruning on Qualcomm Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you compress the 7B router with 80% structured pruning or distill it into a dense 1.4B student for Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1325", "title": "Evaluating Request Routing on Qualcomm Cloud AI 100", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the 8-card Cloud AI 100 cluster use model-ID consistent hashing or queue-depth weighted round-robin for request routing?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1326", "title": "Evaluating Memory-Mapped Weight Loading on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which approach optimizes cold start time and memory footprint on an NVIDIA Jetson Orin (32 GB LPDDR5), and what are the system tradeoffs?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1327", "title": "Evaluating Memory Management Strategies on Google Coral Edge TPU", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach provides the better stability and performance tradeoff for deterministic real-time inference: fixed-size pinned-memory pools, or OS eviction with batch size reduced to 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1328", "title": "Mixed-Precision 
Strategy for On-Premise LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should the 20B LLM use W8A8 or W8A16, and how do memory bandwidth and accuracy drive the choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1329", "title": "Operator Scheduling Tradeoffs on Jetson Orin GPU vs DLA", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which strategy do you choose to maximize throughput while staying within the 60W power budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1330", "title": "Cloud AI 100 I/O Bottleneck Evaluation", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design better addresses the latency bottleneck and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1331", "title": "Evaluating Guardrail Deployment on NVIDIA Jetson Orin Edge Devices", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which guardrail architecture is better suited for this deployment, considering strict latency constraints and the Orin's 60W TDP limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1332", "title": "Format Selection for Host-Streamed Hailo-8 Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the offline inference logs be stored as Zstd Parquet or uncompressed flat binary records to maximize accelerator utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1333", "title": "Audio Stream Pipeline for Coral Edge TPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which pipeline is better for latency and system stability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1334", "title": "Dataflow Tradeoffs on Qualcomm Cloud AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the ViT-L MLP layers, would you use a weight-stationary or output-stationary dataflow, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-1335", "title": "Evaluating Thermal Strategies for Jetson Orin Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For a continuous 150 TOPS workload at 45°C ambient, would you use active cooling or passive burst-and-idle scheduling, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1336", "title": "Shadow Deployment Memory Constraints on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given that V1 uses 6 GB of memory and V2 uses 7 GB, how much shared memory remains during the shadow rollout?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1337", "title": "Adversarial Defense Overhead on Hailo-8", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total end-to-end latency and accelerator streaming-window host memory read bandwidth for one frame with 3 perturbed 1080p inputs on the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1338", "title": "Hailo-8 Dataflow Batching Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If 4 synchronized cameras are statically batched on the Hailo-8, what is the theoretical minimum compute latency for the batch assuming 100% MAC utilization?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1339", "title": "Compound Pipeline Throughput on Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the maximum theoretical throughput and energy per request for the sequential router-plus-expert pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1340", "title": "Hailo-8 Inference Throughput and Power Efficiency Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and frames per Joule for the 65 GOPS/frame Hailo-8 detector?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 1}, "chain_tiers": 
{"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1341", "title": "Coreset Selection for INT8 Calibration", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How large is the 0.5% uncompressed calibration coreset, and does it fit in 32 GB of memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1342", "title": "Validation Compute Budget on Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much Cloud AI 100 compute remains for a pre-validation anomaly detection model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1343", "title": "Active Learning Budget on Coral Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What are the daily labeling cost per device and Coral TPU compute fraction for the 50 MOPS uncertainty estimator at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1344", "title": "Drift Detection Latency Budget on Google Coral", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 15 ms inference in a 20 ms frame budget, is 10-bin PSI on the host CPU feasible and how much latency budget remains?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": {"edge-chain-auto-secondary-009-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1345", "title": "PCIe DMA Overhead on Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the PCIe DMA transfer time for one uncompressed 1080p RGB INT8 frame, and is the system compute-bound or data-movement-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1346", "title": "Hailo-8 INT8 Energy Per Operation Math", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average energy per INT8 operation at 26 TOPS and 2.5W, and how does it compare with an INT8 MAC baseline?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1347", "title": "On-Device Demographic Parity Energy Budget", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the Demographic Parity difference and total energy consumed (in Watt-hours) if evaluating 1,000 records/sec at 60W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1348", "title": "Jetson Orin INT8 Compute Utilization for Vision Transformers", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What INT8 compute throughput is required for 30 FPS, and what is the theoretical minimum Jetson Orin utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1349", "title": "Distilling to INT8 for Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you design the Coral Edge TPU student distillation pipeline, and what theoretical max FPS does a 50 GOPs/frame student achieve?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1350", "title": "Load Balancing Requests Across Google Coral TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 200 requests/sec with 50 GOPS/request, how much compute is required, can one Coral TPU handle it, and how should traffic be routed across 4 TPUs?", "chain_ids": ["edge-chain-auto-secondary-006-07"], "chain_positions": {"edge-chain-auto-secondary-006-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1351", "title": "Multi-Process mmap on Qualcomm Cloud AI 100", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For three INT8 8B LLM processes on 32 GB Cloud AI 100, what are the weight footprints with mmap versus independent loading and remaining memory?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1352", "title": "Host Buffer Sizing for Dataflow Edge Streaming", "topic": "memory-pressure-management", "competency_area": "memory", 
"track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much pinned host memory is needed to buffer enough 6 MB frames for a 50 ms stall with 4 MB contiguous pages?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1353", "title": "Estimating ResNet-50 Inference Speed on Coral Edge TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the Coral Edge TPU's maximum theoretical FPS for an 8 GMAC/image model, and what precision constraint must the team understand?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1354", "title": "Dual-Branch Network Scheduling on Qualcomm AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the peak SRAM footprints for the A-then-B and B-then-A branch schedules, and which order minimizes spilling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1355", "title": "USB I/O Bottleneck on Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum per-frame latency including TPU compute and USB transfer, and what is the primary bottleneck?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1356", "title": "Guardrail Memory Footprint on Cloud AI 100", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What total INT8 weight memory do the guardrails and 24B model require, and how much of the 32 GB remains for KV cache and activations?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1357", "title": "Edge Telemetry Storage Sizing on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 3 GB 
uncompressed log buffer last, and what is the hourly Parquet disk footprint with 4x compression if you switch to a columnar format?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1358", "title": "Hailo-8 Multi-Camera Ingestion Bandwidth", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What continuous host-to-Hailo-8 bandwidth is required to stream six uncompressed 1080p RGB cameras at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-014-12"], "chain_positions": {"edge-chain-auto-secondary-014-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1359", "title": "Google Coral INT8 Throughput and Energy Calculation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum IPS and energy per inference for the 200M-MAC INT8 model on the 4 TOPS Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1360", "title": "Calculate Sustained TOPS Under Edge Thermal Throttling", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming a linear relationship between power draw and compute throughput when throttled, with passive cooling limited to 60W, what maximum sustained INT8 TOPS can the Cloud AI 100 deliver?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1361", "title": "Active Learning Storage on Qualcomm Cloud AI 100", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If 0.05% of processed frames trigger the filter and each requires 1.5 MB, how much daily storage is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1362", "title": "Edge Guardrail Memory and Latency Sizing", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum INT8 safety model parameter count in the remaining memory, and its theoretical latency for a 100-token sequence at 20% TOPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1363", "title": "Hailo-8 Host Memory Bandwidth for Data 
Logging", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum logging FPS can the host sustain for uncompressed row frames versus 4x-compressed TFRecord with 200 MB/s overhead?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1364", "title": "Shadow Deployment Sizing on Hailo-8", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 14 TOPS reserved for Model A, what maximum shadow FPS can Model B run at without exceeding 26 TOPS, and what is the estimated total power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1365", "title": "Jetson Orin GPU vs DLA Efficiency", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and Joules per frame for DLA-only versus GPU-only deployment of the 250 GOPS/frame model?", "chain_ids": ["edge-chain-auto-secondary-008-11"], "chain_positions": {"edge-chain-auto-secondary-008-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1366", "title": "Activation Spilling Latency on Dataflow Accelerators", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What latency penalty does spilling and reloading the 256x256x64 INT8 activation add over a 4 GB/s host interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1367", "title": "Randomized Smoothing on Edge TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At exactly 20 FPS, what maximum randomized smoothing pass count N can the Coral run, and what energy is consumed per smoothed frame?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1368", "title": "Stream Bandwidth for Depthwise Convs on Hailo-8", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory streaming bandwidth is required to feed the 100x100x64 depthwise-plus-pointwise layer at 10 TOPS utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1369", "title": "Throughput and Efficiency on a 4 
TOPS, 2W Edge Accelerator", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and inferences per Joule for a 2 GOPs/forward-pass INT8 model on a 4 TOPS, 2W accelerator?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 0}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1370", "title": "Compute-Bound Data Pipeline Throughput", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What data pipeline throughput in GB/s is needed to keep the 400 TOPS Cloud AI 100 fully utilized on 0.5 TOPS/frame 1080p inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1371", "title": "Image Validation Gate Throughput Calculation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 0.2% of Jetson Orin's 275 TOPS allocated, what maximum FPS can validate 10-megapixel frames at 200 INT8 ops per pixel?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1372", "title": "Edge Drift Detection Overhead", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the memory footprint in bytes and operation count required per check to calculate the KL divergence of two 256-bin FP32 histograms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1373", "title": "DMA Transfer Latency Calculation for 4K Video Batches", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact PCIe data movement latency for one batch of 8 uncompressed 4K RGB frames over the 12 GB/s link?", "chain_ids": ["edge-chain-auto-secondary-008-18"], "chain_positions": {"edge-chain-auto-secondary-008-18": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1374", "title": "Encoder-Decoder Latency on Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum latency to generate 10 tokens with a 10 GOPS encoder and 2 GOPS per decoder token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1375", "title": "Energy Cost of INT8 MACs on Google Coral", "topic": 
"energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the energy per INT8 MAC in pJ and total compute energy per inference for 10 billion MACs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1376", "title": "4-Bit Weight Packing for Coral Edge TPU Bandwidth", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "With 8 GB/s bandwidth and 32M parameters, what is the maximum inference rate for packed 4-bit weights?", "chain_ids": ["edge-chain-auto-secondary-006-04"], "chain_positions": {"edge-chain-auto-secondary-006-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1377", "title": "Hailo-8 Fairness Evaluation Energy & Disparity", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the Equal Opportunity TPR disparity, theoretical FPS, and total accelerator energy for the 13,000-frame validation run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1378", "title": "Hailo-8 Compute Bound FPS Calculation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum compute-bound FPS for a 52 GOPS model on a 26 TOPS Hailo-8 at 80% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1379", "title": "Calculate Fallback Model Size for Coral TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 50% hardware utilization of peak compute, what is the maximum GOps per frame a fallback INT8 model can use to maintain 20 FPS under the 1W, 2 TOPS thermal cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1380", "title": "Compiler Graph Break Latency Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the effective host bandwidth for streaming is 2.0 GB/s in each direction, what is the total latency overhead added purely by this failure to fuse operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1381", "title": "Sizing the Distilled INT8 Student Model", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", 
"question": "What is the largest INT8 student model that can sustain 10,000 tokens/s at 50% of 400 TOPS, and how much memory remains for KV cache?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1382", "title": "Weighted Round-Robin for Asymmetric Edge Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What weighted round-robin integer weights should the four AI 100 cards use, and what is the maximum theoretical throughput?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1383", "title": "Calculate Maximum Batch Size for LLM on Qualcomm Cloud AI 100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum batch size fits in 32 GB when weights use 20 GB, runtime uses 2 GB, and KV cache uses 1.5 GB per sequence?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 0}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1384", "title": "Multi-Process Memory-Mapped Inference Footprint", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the total memory footprints for four private 7B INT8 model loads versus shared read-only mmap weights, and will either OOM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1385", "title": "INT8 Compute Throughput on Cloud AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum theoretical inference throughput for the 800 GOPS model on the 400 TOPS INT8 hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1386", "title": "CI/CD Validation Latency and Energy on Cloud AI 100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical minimum validation time and accelerator energy for 1,000,000 samples at 500 GOPS each on a 400 TOPS, 75W accelerator?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1387", "title": "Edge TPU Operator Fallback Latency", "topic": "model-format-conversion", 
"competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum latency when 90% of a 2B-MAC model runs on the 4 TOPS Edge TPU and 10% on a 0.1 TOPS CPU sequentially?", "chain_ids": ["edge-chain-auto-secondary-006-12"], "chain_positions": {"edge-chain-auto-secondary-006-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1388", "title": "LLaMA 7B INT8 Memory Footprint on Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What memory footprint do the 7B INT8 weights require, and how much of the Jetson Orin's 32 GB unified memory remains?", "chain_ids": ["edge-chain-auto-secondary-003-27"], "chain_positions": {"edge-chain-auto-secondary-003-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1389", "title": "Alerting on Host-to-Device Streaming Starvation", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At only 300 MB/s PCIe bandwidth, what frame drop rate should monitoring expect for uncompressed 1080p RGB input at a 60 FPS target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1390", "title": "Hardware-Aware NAS Latency Bound on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What minimum compute-bound latency should the NAS controller estimate for an 800 GOPs INT8 model on a 400 TOPS accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1391", "title": "Compute Peak Memory for Sequential Operators", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute peak activation memory footprint for the 3GB, 5GB, and 2GB sequential tensors, and what formula gives it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1392", "title": "Calculate Theoretical Minimum Compute Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 100% compute utilization, what is the theoretical minimum compute latency per frame for an 800 GOPS INT8 inference on a 400 TOPS Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1393", "title": "Unstructured Sparsity Bandwidth on Hailo-8", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What weight bandwidth at 60 FPS is required for the dense 4M INT8 layer versus 50% CSR sparsity with 16-bit indices, and does CSR help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1394", "title": "Calculate thermal throttling impact on Orin throughput", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum FPS should you expect when the Jetson Orin is throttled from 60W to 25W assuming linear scaling?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1395", "title": "Zero-Copy Micro-Batching on Hailo-8 Architecture", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Because Hailo-8 is a dataflow accelerator with no local DRAM, how would you batch and schedule the detector and license plate reader to minimize PCIe contention and meet the 33ms detector SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1396", "title": "On-Device Coreset Selection for Edge Continual Learning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the AI 100 active data selection and coreset pipeline to maximize ICR while sending only a tiny fraction of 60 FPS video for labeling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1397", "title": "Zero-Copy Video Pipeline for Coral Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a zero-copy pinned-memory DMA pipeline from camera to Coral to meet the 33 ms 30 FPS deadline?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1398", "title": "Cross-Silo FL Model Aggregation on Qualcomm AI 100", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you size LoRA fine-tuning for the 3B model on a 32 GB AI 100 and minimize daily synchronization bandwidth across 50 hospitals?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1399", "title": "Automotive ASIL-D Certification for Lidar Perception on Jetson Orin", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What Orin architecture and architectural choices would provide deterministic ASIL-D fault detection within 50 ms for transient hardware faults and software lockups?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1400", "title": "Thermal Sizing for Fanless Hailo-8 Edge Camera", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is the 13 TOPS, 4 GB/s Hailo-8 vision pipeline thermally feasible in the 8W fanless enclosure, and what changes are needed?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1401", "title": "Edge TPU Precision Requirements for Efficient CNNs", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific numerical precision format is strictly required by the Google Coral Edge TPU for hardware acceleration?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1402", "title": "Hailo-8 Local Memory Specification", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "When designing the memory buffer for these intermediate states, how much local DRAM does the accelerator provide to store these chained inference outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1403", "title": "Qualcomm Cloud AI 100 Specifications", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the baseline INT8 throughput, memory capacity, and power consumption of a single Qualcomm Cloud AI 100 card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1404", "title": "Hailo-8 Local DRAM Memory Architecture", "topic": "data-efficiency-selection", 
"competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific capacity of local DRAM on the Hailo-8?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1405", "title": "Jetson Orin Zero-Copy Memory Architecture Identification", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What architectural feature makes zero-copy data movement possible, and what memory allocation technique is required to leverage it?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1406", "title": "Coral Edge TPU INT8 Efficiency", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the mandatory numerical precision required for the Google Coral Edge TPU, and what is its peak performance per watt?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1407", "title": "Analyzing Edge TPU Throughput Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does a 20 GOPS Edge TPU model achieve only 40 FPS with high host CPU utilization despite a 200 FPS compute ceiling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1408", "title": "KV Cache OOM Analysis on Cloud AI 100", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the 7B INT8 LLM OOM on the 32 GB Cloud AI 100 at batch 64 and sequence length 2048 despite weights using only 7 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1409", "title": "Thermal Throttling of Adversarial Defenses", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does adding 5 ms randomized input smoothing drop the Jetson Orin workload to 20 FPS instead of the expected ~28 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1410", "title": "Dynamic Batching Latency Spikes on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic batching to batch size 4 with a 30 ms timeout raise p99 latency from 15 ms to 50 ms on the 120 FPS pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1411", "title": "Explaining Latency Regressions in Depthwise Convolutions on Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can replacing dense convolutions with depthwise separable convolutions slow the Cloud AI 100 despite an 8x FLOP reduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1412", "title": "Edge TPU Pipeline Throughput Degradation Analysis", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the two-stage Edge TPU pipeline reach about 40 ms latency with 90% CPU utilization, and what is the true end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1413", "title": "Compute Efficiency and Power Limits on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Jetson Orin hit its 60W power limit before reaching its 275 TOPS compute limit, and what is the energy cost per inference?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 1}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1414", "title": "Information-Compute Ratio on Coral TPU", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the Coral TPU system's Information-Compute Ratio collapse at 120 FPS despite fitting within the 4 TOPS limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1415", "title": "Host Preprocessing Bottleneck on Hailo-8", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Hailo-8 dataflow accelerator starved, and what bottleneck latency does the pipeline equation imply at 12 FPS?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 1}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "edge-1416", "title": "INT8 Clipping in Edge Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the Coral INT8 anomaly gate flag 40% of valid bright frames as anomalous during peak sunlight?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1417", "title": "Data Selection Bias in High-Throughput Edge Inference", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did sampling only 2 FPS for curation cause the 60 FPS Hailo-8 deployment to fail despite strong validation metrics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1418", "title": "Drift Detection I/O Bottleneck on Dataflow Edge", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does streaming a 3MB intermediate feature map to the host cap the Hailo-8 pipeline at 200 FPS despite low compute use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1419", "title": "Hailo-8 DMA PCIe Streaming Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does this architecture experience this specific performance wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1420", "title": "Encoder-Decoder Latency Disparity on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the encoder run in 10ms while the autoregressive decoder takes 250ms on the same Jetson Orin hardware?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1421", "title": "Energy Cost Analysis of Memory vs Compute on Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the Horowitz energy table principles, why does memory-bound Layer B draw 45W at only 10% TOPS utilization while dense Layer A draws 20W at 80% utilization?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": {"edge-chain-auto-secondary-005-12": 0}, "chain_tiers": 
{"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1422", "title": "Analyzing 3-bit GPTQ Latency Degradation on Jetson Orin", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can 3-bit GPTQ make the 14B model slower and more power-hungry on Jetson Orin than standard INT8 despite fitting in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1423", "title": "INT8 Mixed-Precision Fairness Tradeoff on Edge Accelerators", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does pure INT8 raise one subgroup's false rejection rate, and what throughput cost comes from keeping the final 20% of compute in FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1424", "title": "Analyzing Federated Averaging Bottlenecks on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does synchronization, not local training, bottleneck the Jetson Orin federated system with 800 MB updates, and how would you reduce it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1425", "title": "Batch Size Impact on Compute Utilization", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Cloud AI 100 utilization jump from 40 sustained TOPS at batch size 1 to 320 TOPS at batch size 32?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1426", "title": "Power-Constrained Degradation on Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the robot switch to a 40 TOPS DLA-only model at 15 FPS instead of running the 160 TOPS GPU model at a lower framerate under the 25W limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1427", "title": "SRAM Spilling in Fused Operators on Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does fusing the convolution and GeLU make latency worse, what spill overhead does the 4MB SRAM spill add, and what is the net latency regression after the 5 microsecond launch 
saving?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1428", "title": "Kernel Fusion on Heterogeneous SoCs", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does fusing the DLA convolution with the custom GPU activation degrade latency even though it removes 20MB of LPDDR5 traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1429", "title": "Distillation Arithmetic Intensity on Streaming Edge Accelerators", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the 1 GOP distilled student slower when its bytes per inference exceed the 5 GOP pruned teacher on a 4 GB/s transfer path?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1430", "title": "Edge LLM Latency Breakdown on AI 100", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the end-to-end latency for the 500-token prompt and 100-token response, and why does 400 TOPS mainly help prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1431", "title": "Hailo-8 Host Memory Bandwidth Imbalance", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does round-robin routing create latency spikes across two identical Hailo-8 chips for mixed-resolution video streams?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1432", "title": "Hailo-8 Host Memory Streaming Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 reach only about 15 FPS for the 50 GOP model despite needing only 3 TOPS at 60 FPS?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1433", "title": "Coral Edge TPU mmap Initialization Over USB", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does host mmap fail to give zero-copy Coral initialization, and why is each 24MB model cold start exactly 400ms?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1434", "title": "OOM During Gradient Accumulation", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Jetson Orin fine-tuning job still hit OOM at the end of the final micro-batch despite 4-way gradient accumulation?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1435", "title": "Hailo-8 CI/CD Pipeline Bandwidth Bottleneck", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the Hailo-8 stay under 2% compute utilization yet cap at 50 FPS on 4MB input frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1436", "title": "TensorRT Graph Fragmentation on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do three DLA-unsupported ONNX operators disproportionately hurt Jetson Orin latency, and what is the new effective throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1437", "title": "Thermal Throttling on Jetson Orin GPU vs DLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving some layers from the Ampere GPU to the DLA let the 200 TOPS vision pipeline run without thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1438", "title": "Edge TPU Quantization and Bottleneck Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral reject the 8M-parameter FP32 model, and what are the FP32 and INT8 memory footprints that make USB I/O dominate?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1439", "title": "Power Throttling Stragglers on Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do P99 latency stragglers appear when the Cloud AI 100 hits 75W, and what latency increase results from a 20% compute throttle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1440", "title": "NAS Memory-Bound 
Analysis on Hailo-8", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does hardware-aware NAS prefer a 4 GOPS, 4M-parameter model over a 2 GOPS, 20M-parameter model on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1441", "title": "Edge TPU Subgraph Partitioning Transfer Overhead", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does inserting an unsupported Swish between two TPU Conv2Ds cause high latency, and what USB transfer penalty does the 1MB feature map create?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1442", "title": "A/B Partition OTA Power Throttling on Jetson Orin", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the system throttle INT8 inference during background OTA flash programming even though weights are already in memory?", "chain_ids": ["edge-chain-auto-027-18"], "chain_positions": {"edge-chain-auto-027-18": 0}, "chain_tiers": {"edge-chain-auto-027-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1443", "title": "Explain Thermal Throttling on Google Coral Edge TPU", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Coral Edge TPU inference latency double after 15 minutes of continuous 4 TOPS operation in a passively cooled enclosure?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1444", "title": "Analyzing I/O Latency Bottlenecks on Hailo-8", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is the end-to-end latency about 7ms when the Hailo-8 compute time is only 1ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1445", "title": "Unstructured Sparsity Inefficiency on Dense Accelerators", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 80% unstructured pruning fail to improve latency on the dense Cloud AI 100 INT8 accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1446", "title": "Per-Channel INT8 Overhead on AI 100", "topic": 
"quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does per-channel INT8 quantization slightly slow the Cloud AI 100, and how much extra metadata is needed for a linear layer with 4096 output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1447", "title": "Coral Edge TPU Frame Budget Analysis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the Coral Edge TPU pipeline miss the 16.6ms frame budget even though the TPU runs the model in 8ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1448", "title": "Quantization Bias in TPU Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why did deploying the toxicity guardrail on Coral cause a minority-dialect fairness failure, and what latency and energy costs come with CPU FP32?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1449", "title": "Jetson Orin Roofline Memory Bound Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the roofline model, why does a 100 OP/byte INT8 model reach only about 7% compute utilization on Jetson Orin?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 2}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1450", "title": "Watchdog Resets Under Thermal Throttling", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the BIST watchdog reset occur under peak thermal load even though INT8 inference still meets its 20ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1451", "title": "Row vs Columnar Storage Latency and Throughput Tradeoffs", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does compressed Parquet hurt batch-1 latency but boost batch-4096 throughput, and what is the memory read reduction per batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1452", "title": "Unified Memory Contention in 
High-Frequency Edge Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why do latency spikes and frame drops appear at 60Hz LiDAR ingestion despite 30% accelerator utilization and 40W power draw?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1453", "title": "Streaming Dataflow Bottlenecks on Edge Accelerators", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the accelerator reach only 0.2 TOPS on this 100 MOPS, 2 MB layer despite a significantly higher peak capability?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1454", "title": "Analyzing Thermal Throttling on Google Coral Edge TPU", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the Coral Edge TPU drop from 30 FPS to 15 FPS after 15 minutes at 40°C, and what thermal tradeoff is occurring?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1455", "title": "Edge TPU Attention Cost Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does the 256-token transformer's attention fall back to CPU on the Edge TPU, and how would you keep it on INT8 hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1456", "title": "KV Cache Scaling on a 32 GB NVIDIA A10", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 7B INT8 LLM OOM at batch size 14 with a 4096-token context on a 32 GB NVIDIA A10?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1457", "title": "Shadow Deployment Architecture on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you run a shadow vision model on Jetson Orin without adding production latency or exceeding the 32 GB, 60W limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1458", "title": "On-Premise Video Analytics Accelerator Selection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": 
"Would you use multiple T4-class GPUs or one Cloud AI 100 for 50 1080p streams under a 150W budget, and why?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1459", "title": "On-Device Fine-Tuning Memory Strategy", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you use freezing and gradient checkpointing to fine-tune on Jetson Orin without OOMing 32 GB shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1460", "title": "Adversarial Defense Architecture for Streaming Edge Accelerators", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which single-pass adversarial-patch defense would you deploy on Hailo-8 under a 500 MB/s bus and 10 ms/frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1461", "title": "Hailo-8 Multi-Stream Dataflow Batching Architecture", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What batching policy keeps four 30 FPS streams under 15 ms on one Hailo-8 while sustaining 120 FPS total?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1462", "title": "On-Premise RAG Pipeline Design for Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the Cloud AI 100 RAG pipeline, would you co-locate the embedding, ranker, and 8B LLM or swap them sequentially to meet 1s TTFT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1463", "title": "Multi-Model Drone Inspection Design on Hailo-8", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the true system-level cost of running four 15 GOPS/frame networks at 30 FPS on Hailo-8, and how would you avoid bandwidth stalls?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 3}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1464", "title": "Streaming Coreset Selection for Edge Continual Learning", "topic": 
"data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you build a streaming coreset pipeline to select from 10 million daily logs under the 75W limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1465", "title": "Active Learning Curation for Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you curate field data from Coral Edge TPU cameras to improve INT8 accuracy without exceeding edge bandwidth limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1466", "title": "Coral TPU Edge Drift Detection Architecture", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you detect lighting or sensor drift on the system without exceeding the 2W budget or relying on unsupported operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1467", "title": "Zero-Copy Video Pipeline Architecture for Google Coral Edge TPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the 1080p60 Coral USB data pipeline to fix 100% CPU use while the Edge TPU sits at 30% utilization?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1468", "title": "Architecting Real-time Translation on Hailo-8", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which speech-to-text architecture would you choose for this device, and how would you handle decoding under its streaming constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1469", "title": "Architecting Energy-Efficient Streams for Hailo-8", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Architect the data ingestion and model execution flow for four 1080p streams to minimize host memory access energy per operation?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1470", "title": "Sub-4-bit Weight Streaming for DRAM-less Edge 
Accelerators", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you use sub-4-bit host-side weight storage while still feeding INT8 weights efficiently to the accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1471", "title": "Architecting On-Device Fairness Evaluation for Edge Vision", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you run continuous on-device fairness evaluation without sending video off-device or exceeding 32 GB and 60W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1472", "title": "Federated Learning Architecture on Hailo-8 Dataflow Accelerators", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design on-device federated learning with Hailo-8 nodes given host streaming bottlenecks and non-IID driver data?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 1}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1473", "title": "Optimizing Sensor Fusion Kernels on Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you structure CUDA thread blocks, warps, and shared memory tiling on Jetson Orin to avoid LPDDR5 bandwidth saturation?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1474", "title": "Hailo-8 Host Bandwidth Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Design a fail-operational graceful degradation ladder that maintains checkout accuracy without crashing the dataflow pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1475", "title": "Multi-Model Compilation Strategy for NVIDIA Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you map the detector, planner, and fusion graphs across Jetson Orin's GPU and DLAs within the 60W and 32 GB limits?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, 
"validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1476", "title": "Architecting Knowledge Distillation for Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill the float32 teacher into a Coral-compatible INT8 student while respecting Edge TPU operator limits?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1477", "title": "Architecting an Object Detection Pipeline on Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate the 30 ms latency budget for 4K60 drone vision on Jetson Orin across preprocessing, inference, postprocessing, and control?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1478", "title": "Multi-TPU Routing for High-Frequency Industrial Inspection", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 8 concurrent 60 FPS streams across four Coral Edge TPUs, and is the load sustainable without admission control?", "chain_ids": ["edge-chain-auto-secondary-006-07"], "chain_positions": {"edge-chain-auto-secondary-006-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1479", "title": "Edge TPU Multi-Model Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which models would you keep resident in Coral Edge TPU SRAM, and when would you swap over USB for the motion/person/face cascade?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1480", "title": "Shared Weight Mapping for Multi-Model Edge Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design memory-mapped model sharing for 5 CV models across 20 streams on a 32 GB appliance?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1481", "title": "Handling Host Memory Pressure for Hailo-8 Streams", "topic": "memory-pressure-management", "competency_area": "memory", 
"track": "edge", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you manage host memory for Hailo-8 DMA under fragmentation and daemon spikes in a 4 GB smart-city appliance?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1482", "title": "Hailo-8 Multi-Model Pipeline Design and Host Memory Offloading", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you fuse the detector, pose, and action models for Hailo-8 or deploy separate ONNX graphs, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1483", "title": "Architecting a Multi-Model Pipeline on Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you lay out memory and schedule detector, tracker, and transformer Re-ID execution on Cloud AI 100 for batches up to 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1484", "title": "Multi-Tenant Operator Scheduling on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule multiple transformer models on Cloud AI 100 to improve INT8 TOPS utilization without memory thrashing?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1485", "title": "Architecting a Resilient OTA Update System for Jetson Orin", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design OTA updates for Jetson Orin robots to allow zero-downtime downloads and automatic rollback after boot failure?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 4}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1486", "title": "Architecting Low-Latency Profiling for Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you profile Coral Edge TPU latency spikes to separate USB bandwidth, CPU-transfer, and operator-fallback bottlenecks without significantly 
impacting the device's 2W power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1487", "title": "Architecting Structured Sparsity on Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Would you use 80% unstructured pruning for this edge model, and what hardware-aware sparsity strategy would you deploy?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 3}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1488", "title": "Designing a Hybrid Quantization Strategy for Jetson Orin", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose PTQ vs QAT, calibration granularity, and GPU/DLA mapping for a 7B multimodal pipeline on Jetson Orin?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 2}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1489", "title": "On-Premise Guardrail Architecture for Legal Summarization", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you allocate hardware resources between the main summarization model and guardrails to meet a 2-second end-to-end latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1490", "title": "Edge Sensor Data Ingestion Pipeline Architecture on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What storage format and compression strategy would you use on Jetson Orin to handle 1.5 GB/s sensor ingress without starving inference?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1491", "title": "Real-time Multi-Camera Pipeline Architecture for Hailo-8 Dataflow Accelerator", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you architect the streaming ingestion pipeline to handle frame capture, preprocessing, and continuous tensor streaming to the Hailo-8 without starving it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1492", "title": "Architecting Dataflow for Google Coral Edge TPU", "topic":
"systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What tiling and dataflow strategy would you use to optimize a high-resolution CNN for the Coral Edge TPU's INT8 systolic array?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1493", "title": "Thermal Management for a 75W Edge Accelerator", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you balance active cooling, DVFS, and workload shedding to sustain throughput at 45°C on the 75W accelerator?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1494", "title": "Architecting Unified Memory for Concurrent Edge Models", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you budget the Jetson Orin's 32 GB shared memory so the 14B LLM and ViT never OOM during concurrent inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1495", "title": "Architecting a Multi-Camera Perception System for Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What CNN architecture and GPU/DLA mapping would you use for six 4K 30 FPS streams on Jetson Orin without becoming memory-bound?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1496", "title": "Hailo-8 Dataflow Optimization and Memory-Bound Operator Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the fusion strategy for these memory-bound operations to minimize latency and host memory access on the Hailo-8?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 2}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1497", "title": "Edge TPU Fleet CI/CD Architecture", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you build the CI/CD pipeline to quantize, compile, validate, and safely OTA-deploy Coral Edge TPU model updates?", "chain_ids": 
["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 2}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1498", "title": "Hardware-Aware NAS for Edge TPU Real-Time Inference", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS for Coral Edge TPU that meets 30 FPS, 2W, INT8, and operator-compatibility constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1499", "title": "Architecting a Streaming Pipeline for Hailo-8", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you use the roofline model to evaluate the tradeoffs of which operations execute on the Hailo-8 versus the host, and how does the lack of local DRAM constrain your design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1500", "title": "Jetson Orin Unified Memory Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Given the Orin's architecture and its 60W TDP budget, what is the most likely root cause of this latency bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1501", "title": "Latency Spikes in Conditional Privacy Guardrails on Hailo-8", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hardware architecture constraint causes these extreme latency spikes despite low compute utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1502", "title": "Real-Time Video Ingestion Bottleneck on Cloud AI 100", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this poor accelerator utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1503", "title": "Diagnosing DLA Memory Bottlenecks on Jetson Orin", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely architectural root cause of this severe memory bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1504", "title": "Diagnosing Sustained Throughput Drop on Hailo-8", "topic": "thermal-management", "competency_area": 
"power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on the hardware architecture, what is the root cause of this performance collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1505", "title": "Canary Rollout CPU Fallback Degradation", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this deployment failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1506", "title": "Diagnosing Power Anomalies from Adversarial Energy Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the Cloud AI 100 to hit its 75W TDP and throttle despite unchanged input resolution and frame rate?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1507", "title": "Diagnosing Throughput Collapse with Dynamic Batching on Cloud AI 100", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing this throughput collapse despite the high incoming request rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1508", "title": "Diagnosing CPU Fallback in EfficientNet on Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural feature of standard EfficientNet causes this fallback, and how is it resolved?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1509", "title": "Diagnosing Host Bottlenecks with Synthetic Data on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What system-level bottleneck is leaving the accelerator idle while the host CPU is at 100% after synthetic-data-style preprocessing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1510", "title": "Hailo-8 Host Memory Streaming Validation Contention", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you diagnose the root cause of these anomalies 
in the data pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1511", "title": "Diagnosing INT8 Quantization Failures from Biased Calibration", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What calibration dataset curation failure caused the Jetson Orin INT8 model to fail at night while FP32 still works?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1512", "title": "Feature Map Drift Detection Power Throttling", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the power spike and thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1513", "title": "Diagnosing OOM Errors in Edge Encoder-Decoder", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1514", "title": "INT8 Quantization Bias on Coral Edge TPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What INT8 calibration failure on Coral Edge TPU likely caused the new recall disparity, and how would you verify it?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1515", "title": "Edge TPU Operator Fallback Diagnosis", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the hardware-level root cause of this symptom, and how do you confirm it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1516", "title": "Diagnosing Distillation Latency Spikes on Edge Accelerators", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing these latency spikes and how would you diagnose the bottleneck?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1517", "title": "Dataflow Starvation and Watchdog Triggers", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What architectural issue makes inference miss deadlines when the host writes diagnostic logs to NVMe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1518", "title": "Diagnosing Host Bottlenecks with Dataflow Accelerators", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What system-level bottleneck causes 30 FPS throughput and 100% host CPU despite the Hailo-8's 500 FPS standalone benchmark?", "chain_ids": ["edge-chain-auto-secondary-008-13"], "chain_positions": {"edge-chain-auto-secondary-008-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1519", "title": "Edge TPU Activation Memory Spilling and Partitioning", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the INT8 Coral Edge TPU model get partitioned into subgraphs with hundreds of milliseconds of latency?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1520", "title": "Diagnosing Data Starvation on Jetson Orin", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing this throughput bottleneck, and how would you diagnose the root cause within the data ingestion pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1521", "title": "Diagnosing Hotspots in Edge Consistent Hashing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does one node throttle and OOM while all three nodes receive the same number of camera streams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1522", "title": "Diagnosing Thermal Throttling on NVIDIA Jetson Orin", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of this sudden performance drop, and how do you diagnose it?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1523", "title": "Shadow vs Canary Deployment on Host-Dependent Accelerators", "topic": 
"ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which rollout strategy (timesliced shadow or fleet-subset canary) do you recommend for this specific hardware, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1524", "title": "Batching on Google Coral Edge TPU", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you use static batching or dynamic batching for 4 Coral TPU cameras at 30 FPS to minimize p99 latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1525", "title": "Dataflow Accelerator CNN Architecture Evaluation", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which CNN architecture should you choose for Hailo-8, and how would you evaluate actual on-device performance beyond FLOP count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1526", "title": "Optimizing RAG Pipeline Latency on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture do you choose, and how do you allocate resources to ensure predictable latency?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1527", "title": "Evaluating Cost and Latency Trade-offs on Google Coral Edge TPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the robots use cloud GPU inference over 5G or local Edge TPU inference for 10 FPS 24/7 object detection, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1528", "title": "Edge Data Quality Gating on Jetson Orin", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data quality gate is better on Jetson Orin: a GPU autoencoder or ISP heuristics plus a tiny DLA INT8 classifier?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1529", "title": "Active Learning Strategies for On-Premise Defect 
Detection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which active learning strategy do you select, and how do you justify the hardware resource tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1530", "title": "Evaluating Drift Detection on On-Premise Accelerators", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which alternative is better and why?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1531", "title": "Evaluating Host-Device DMA Strategies for High-Resolution Video Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate these alternatives and determine which architecture point maximizes overall system throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1532", "title": "Architecture Selection for Coral Edge TPU Translation", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which translation architecture should you deploy on the Coral Edge TPU given full INT8 quantization and limited operator support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1533", "title": "Evaluating Energy Efficiency of Architectures on Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 model will consume less energy per inference on the 2W Coral Edge TPU, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1534", "title": "INT8 Calibration Bias on Dataflow Edge Accelerators", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which design is better for evaluating and maintaining fairness, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1535", "title": "Evaluating Compute Migration to Dataflow Accelerator", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you partition intermediate feature maps to the CPU or compile the entire model onto the accelerator under the 3W budget, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1536", "title": "Evaluating Fail-Operational Fallbacks on Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy provides better fail-operational reliability during thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-011-05"], "chain_positions": {"edge-chain-auto-secondary-011-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1537", "title": "Distillation vs Pruning on Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which compression strategy is better: 80% unstructured pruning + INT8, or Knowledge distillation to a dense 15B parameter student + INT8, and why?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1538", "title": "Load Balancing Strategies for Stateful Edge Inference", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For sticky LLM sessions where p99 latency from KV-cache rebuilds matters most, which routing approach is better and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1539", "title": "Zero-Copy Memory Mapping for Multi-Stream Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory design should you use for four processes sharing the same 8 GB model on the 32 GB embedded device, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1540", "title": "Edge Accelerator Tensor Buffer Memory Pressure", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which memory management strategy better prevents OOM crashes and memory fragmentation for the accelerator tensor I/O?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1541", "title": "Evaluating Mixed-Precision Inference on Qualcomm AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach is better suited for this specific hardware, and what bottlenecks dictate this decision?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1542", "title": "Edge TPU Operator Delegation Tradeoffs", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which alternative do you choose to stay within your power and latency budgets?", "chain_ids": ["edge-chain-auto-secondary-006-12"], "chain_positions": {"edge-chain-auto-secondary-006-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1543", "title": "GPU vs DLA Parallel Scheduling Tradeoffs", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which schedule is better for minimizing end-to-end latency, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1544", "title": "Bottleneck Analysis on Qualcomm Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which proposal is better for reducing the 45 ms frame latency to the 30 ms SLA on the Cloud AI 100, and why?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 3}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1545", "title": "Evaluating Real-Time Safety Guardrails on Jetson Orin", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which approach provides better organizational accountability and system performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1546", "title": "Edge Storage Format for Hailo-8", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which storage format should you use to keep the Hailo-8 fed from 100 MB/s eMMC, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1547", "title": "Edge TPU Audio Event Ingestion Architecture", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which streaming ingestion architecture is better for the edge anomaly detector, and why?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1548", "title": "Dataflow Tradeoffs on Qualcomm AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the LLM projection matrices use weight-stationary or output-stationary dataflow, and how should you tile them?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 2}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1549", "title": "Sustained Throughput vs Thermal Throttling on Jetson Orin", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which strategy yields higher sustained 24-hour throughput in a 40C sealed enclosure, and why?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1550", "title": "Evaluating Edge Coreset Selection for On-Device Adaptation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which strategy is better suited for this edge environment and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1551", "title": "Evaluating 4-bit AWQ versus INT8 on Coral Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should you adopt 4-bit AWQ or standard INT8 QAT for the Coral Edge TPU drone model, and why?", "chain_ids": ["edge-chain-auto-secondary-006-04"], "chain_positions": {"edge-chain-auto-secondary-006-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1552", "title": "Edge TPU Operator Fusion Tradeoff", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you keep Swish on the host CPU or replace it with ReLU6 for full INT8 TPU execution, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1553", "title": "ViT vs CNN Ensemble Memory Tradeoffs on Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is better suited for the Jetson Orin: the FP16 ViT-L or the INT8 EfficientNet-B7 ensemble, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1554", "title": "Optimizing Orin Pipeline Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which architecture is more likely to meet the 33 ms latency target, and what is the critical bottleneck in each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1555", "title": "Dual-Model Memory Management on Edge TPU", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "With 8 MB Coral SRAM, should you swap the 4 MB model over USB or spill 2 MB activations, and which has lower latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1556", "title": "Evaluating OTA Strategies for Qualcomm Cloud AI 100", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which OTA update strategy is better suited for this 50 Mbps constrained environment, and how is rollback handled?", "chain_ids": ["edge-chain-auto-027-18"], "chain_positions": {"edge-chain-auto-027-18": 1}, "chain_tiers": {"edge-chain-auto-027-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1557", "title": "Shadow Deployment Resource Budgeting on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much RAM and power remain for the OS and robotic control stack during the Jetson Orin shadow deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1558", "title": "Adversarial Purification on Hailo-8", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum FPS and energy per frame result from running the 40 GOPS detector plus 12 GOPS purifier on the 2.5W Hailo-8?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1559", "title": "Hailo-8 Static Batch Latency Calculation", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum batch size under the 10 ms latency budget at 50% Hailo-8 utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1560", 
"title": "Depthwise Separable Convolution on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many MACs does the standard 3x3 convolution use versus depthwise separable convolution, and what is the reduction factor?", "chain_ids": ["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1561", "title": "RAG Concurrency and Power on Cloud AI 100", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many concurrent RAG sessions can the 32 GB Cloud AI 100 support, and what is the power budget per session at 75W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1562", "title": "Hailo-8 INT8 FPS and Energy Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the theoretical maximum FPS and energy per frame for the 52 GOPs detector on the 2.5W Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1563", "title": "On-Device Coreset Capacity Calculation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If each embedding is a 4096-dimensional FP16 vector, how many coreset embeddings fit in the remaining Cloud AI 100 memory?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1564", "title": "Data Anomaly Detection Throughput on Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the maximum theoretical validation throughput for a 2-million-operation INT8 model on the 400 TOPS Cloud AI 100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1565", "title": "Active Learning Inference Time on Coral TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How long will one Edge TPU take to process 2,000,000 images at 50% of its 4 TOPS peak if each image costs 8 GOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1566", "title": "Edge TPU Compute Capacity and PSI Drift Calculation", "topic": 
"distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What compute capacity remains after adding the 8 GOPS/frame autoencoder at 50 FPS, and what is the defect-bin PSI contribution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1567", "title": "Edge TPU USB Bandwidth Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the host-to-device transfer time for one 224x224x3 INT8 image over USB 2.0, and is the system compute-bound or I/O-bound?", "chain_ids": ["edge-chain-auto-secondary-008-17"], "chain_positions": {"edge-chain-auto-secondary-008-17": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1568", "title": "Encoder vs Decoder on DRAM-less Hailo-8", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Ignoring compute, what minimum host-weight streaming latency applies to encoder-only versus decoder-only processing of 128 tokens?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1569", "title": "Hailo-8 INT8 Energy Per Operation", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical energy per INT8 operation for the 26 TOPS, 2.5W Hailo-8 before host memory costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1570", "title": "On-Device Demographic Parity for Pedestrian Intention", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the Demographic Parity Difference between adults and children, and what is the TOPS overhead of a 0.5% fairness classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1571", "title": "Jetson Orin INT8 Compute and Energy Budgeting", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 50 FPS, what is the per-frame INT8 operation budget and energy per frame for a 275 TOPS, 60W Jetson Orin?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1572", "title": "Power-Constrained QoS Shedding on Hailo-8", "topic": "graceful-degradation", 
"competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At a 1.0W cap, what degraded compute capacity and primary-detector FPS can the Hailo-8 sustain after shedding the secondary model?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1573", "title": "Knowledge Distillation to INT8 for Google Coral", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What max FPS and energy per inference do you get, and what INT8-aware distillation step is needed for Coral?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1574", "title": "Load Balancing Inference Across Multiple Coral TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "With 12,000 requests/s across 4 Coral TPUs, what compute load per TPU is required and can the cluster sustain it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1575", "title": "Memory-Mapped Concurrent Edge Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 4 processes on 32 GB Cloud AI 100, how much memory per process remains for KV cache and activations with independent vs mmap-shared INT8 weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1576", "title": "Host Memory Allocation and DMA Overhead for Hailo-8", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory footprint is used by the 5-frame Hailo-8 ring buffer using 4KB pages and 16-byte DMA descriptors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1577", "title": "Google Coral Edge TPU INT8 Throughput", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What theoretical max FPS and FPS/W do you get on the 4 TOPS, 2W Coral Edge TPU for a 300M-MAC INT8 frame?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1578", "title": "Parallel Operator Scheduling and Energy on AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "At 80% utilization 
on a 400 TOPS, 75W accelerator, what are the execution time and energy for an 80 TOP INT8 block?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1579", "title": "USB I/O Bottleneck Analysis on Google Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using 400 MB/s effective USB 3.0 bandwidth, how do compute latency and 3MB input transfer time compare for the 2 GOPS model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1580", "title": "Ampere 2:4 Structured Sparsity on Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Will applying Ampere 2:4 structured sparsity allow the 180 TOPS, 20 GB INT8 model to meet the Orin's limits, and what is the sparse weight size?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 1}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1581", "title": "Edge ESG Metrics and Energy Calculation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 50% utilization on a 400 TOPS, 75W accelerator, what latency and energy per 20 TOP moderation request should you report?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1582", "title": "Edge Data Buffering and Parquet Compression", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 150 MB/s telemetry buffered for 1 minute, what are the uncompressed buffer size, compressed Parquet row group size, and 2-hour storage need?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1583", "title": "Hailo-8 Host Streaming Bandwidth and Throughput", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a 65 GOPS/frame model with 5 MB inputs, what is the theoretical maximum frame rate and minimum sustained host memory bandwidth required to avoid stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1584", "title": "Edge TPU Systolic Array Throughput Calculation", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "On a 4 TOPS, 2W Coral Edge TPU, what max FPS and energy per inference result from a 10 GOPS convolution layer?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 0}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1585", "title": "Calculate Cooling Requirements for Cloud AI 100 Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What BTU/hr thermal dissipation is required for eight 75W Cloud AI 100 cards to avoid throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1586", "title": "Estimating Maximum Throughput on Qualcomm Cloud AI 100", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For an 8 GOPS INT8 ResNet-50 on the 400 TOPS, 75W Cloud AI 100, what are the compute-bound max FPS and inferences per watt?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1587", "title": "TensorRT INT8 Graph Optimization on Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the theoretical maximum throughput (in FPS) for the compiled graph and the energy consumed per frame assuming the system operates at its peak 60W TDP?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1588", "title": "Hailo-8 Host Memory Bandwidth Calculation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What host memory bandwidth is needed to keep Hailo-8 at 26 TOPS for a 4 GOPs/frame model streaming 15 MB per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1589", "title": "Edge TPU CI/CD Pipeline Quantization Throughput", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical FPS should the CI/CD smoke test show for an INT8 model requiring 200 GOPS per inference on a 4 TOPS accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1590", "title": "Hailo-8 ONNX Conversion and Graph 
Break Bandwidth", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With a 13 GOPS Hailo-8 model and a 1 MB out/1 MB back CPU fallback, what are the max FPS and fallback host bandwidth?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1591", "title": "Memory Footprint Estimation for Qualcomm Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Does the INT8 13B model fit in 32 GB on Cloud AI 100 after reserving 4 GB for KV cache and 2 GB for workspace, and how much remains?", "chain_ids": ["edge-chain-auto-secondary-003-30"], "chain_positions": {"edge-chain-auto-secondary-003-30": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1592", "title": "Edge Telemetry Buffering on NVIDIA Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 10 Hz with 2 KB snapshots, how much telemetry accumulates per day and what percent of 32 GB RAM would a 7-day buffer consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1593", "title": "Watchdog Threshold for Deterministic Execution", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog threshold should be set if the 1,200 GOPs model uses only 10% of Cloud AI 100 peak (400 TOPS) and the timer must be 20% above WCET?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1594", "title": "Transformer Attention Cost on Coral Edge TPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For N=256 and d=512 on a 4 TOPS Coral Edge TPU, how many INT8 ops does QK^T require and what is the minimum latency?", "chain_ids": ["edge-chain-auto-017-12"], "chain_positions": {"edge-chain-auto-017-12": 0}, "chain_tiers": {"edge-chain-auto-017-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1595", "title": "Hailo-8 4-Bit Streaming Bandwidth", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What sustained host-to-accelerator bandwidth is required to stream a 1.5B-parameter 4-bit AWQ model once per 
token at 20 tokens/s?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1596", "title": "Dataflow Graph Splitting Overhead", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming the Conv2D is 52 GOPS, what latency penalty does the 1920x1080x16 INT8 round trip add over a 2 GB/s PCIe link?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 0}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1597", "title": "Cloud AI 100 Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the TTFT and total end-to-end latency for the Cloud AI 100 service with 2 TOP prefill, 50 tokens at 15 ms TPOT, and the given overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1598", "title": "Shadow Deployment Bandwidth on Hailo-8", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 30 FPS shadow mode on Hailo-8, what host memory bandwidth is needed for two models each moving 6 MB input, 4 MB weights, and 2 MB outputs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1599", "title": "Calculate Efficiency for GPU vs DLA on Orin", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the DLA and GPU TOPS/W efficiencies, and what max FPS can the DLA deliver for a 2.5 TOPS/frame model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1600", "title": "Hailo-8 Activation Spilling Bandwidth Calculation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For a 512x512x64 INT8 activation spilled and restored at 30 FPS on Hailo-8, what sustained host memory bandwidth is required?", "chain_ids": ["edge-chain-auto-secondary-003-19"], "chain_positions": {"edge-chain-auto-secondary-003-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1601", "title": "Edge TPU Randomized Smoothing Throughput Calculation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "With randomized smoothing requiring N=10 inferences per frame, what effective FPS and FPS/W can a 4 TOPS, 2W Coral TPU deliver for a 50 GOPS model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1602", "title": "Hailo-8 MobileNetV2 Block Compute and Memory", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For the MobileNetV2 block on Hailo-8, what are the total INT8 ops and minimum host transfer bytes when intermediates stay on-chip?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1603", "title": "Agent Orchestration Memory Footprint on Jetson Orin", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given 32 GB RAM, a 7B INT8 LLM, 2 GB ViT, 1 GB Whisper, 4 GB OS, and 1 MB/token KV cache, what max context window fits?", "chain_ids": ["edge-chain-auto-secondary-002-09"], "chain_positions": {"edge-chain-auto-secondary-002-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1604", "title": "Edge TPU Inference Throughput and Energy Calculation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical max FPS and energy per inference in mJ does the 4 TOPS, 2W accelerator achieve for a 2 GOPS INT8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1605", "title": "Data Pruning Impact on Edge Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much training compute time per epoch is saved by using a 10% coreset instead of the full 100,000-image dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1606", "title": "Data Bottleneck on Qualcomm AI 100", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 6 MB frames over 12 GB/s PCIe and a 100 GOPS model on a 400 TOPS accelerator, what is the max FPS and the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1607", "title": "Edge Data Quality Gate Compute Utilization", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What percentage of Jetson Orin's 275 TOPS INT8 budget is consumed by a 10 GOPs/frame 
quality gate on 4 cameras at 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1608", "title": "Calculate On-Device PSI for Edge Accelerator", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact PSI for Reference=[0.5,0.3,0.2] and Serving=[0.4,0.4,0.2] using the standard binned formula?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1609", "title": "DMA Overhead vs Compute on Cloud AI 100", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming no pipelining between transfer and compute, what are the DMA transfer time, compute time, and their ratio for the 16 uncompressed 4K frames?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1610", "title": "Edge TPU Encoder-Decoder Throughput Calculation", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum inferences per second can a 4 TOPS NPU achieve when each seq2seq request needs a 15 GOPS encoder plus 20x2 GOPS decoder?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1611", "title": "Calculate Inference Energy on Google Coral Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical energy per inference in millijoules (mJ) and the maximum theoretical frames per second (FPS)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1612", "title": "Runtime Memory for 4-bit Weights on Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the exact runtime weight memory footprint required when the 10M-parameter 4-bit model is unpacked to INT8 for Coral Edge TPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1613", "title": "On-Device Fairness Audit Calculation for Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the exact absolute Demographic Parity 
Difference and theoretical minimum audit processing time on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1614", "title": "Hailo-8 Dataflow Streaming Bandwidth and Throughput Calculation", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What theoretical maximum FPS and minimum host-memory read bandwidth are needed for the 13 GOPS/frame, 1 MB input model on Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1615", "title": "Edge TPU Fallback Model Compute Budget", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum fallback-model compute budget in GOPS per frame to sustain 20 FPS after throttling to 1 TOPS?", "chain_ids": ["edge-chain-auto-secondary-011-05"], "chain_positions": {"edge-chain-auto-secondary-011-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1616", "title": "Hailo-8 Graph Compilation and Power Efficiency Calculation", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute FPS/Watt improvement from reducing the graph to 13 GOPS/frame and raising MAC utilization to 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1617", "title": "Distillation Projection Footprint on AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact INT8 memory footprint of the 1024-to-4096 projection layer in bytes (including bias), and what fraction of the 32 GB memory does it consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1618", "title": "Hailo-8 End-to-End Latency Decomposition for Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total end-to-end latency per frame assuming a 26 TOPS peak accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1619", "title": "Weighted Round-Robin Routing for Heterogeneous Edge Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimal integer weighted round-robin weights should each accelerator get, and what total request capacity supports 0.5 TOPS/request?",
"validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1620", "title": "Calculate memory bound latency for INT8 inference on Cloud AI 100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the theoretical minimum batch-1 single-token generation latency for the 8B INT8 model at 136 GB/s memory bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1621", "title": "Calculate Mmap Cold Start Latency on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the absolute minimum cold-start time to page the 15 GB INT8 model from 2.5 GB/s NVMe into Jetson memory?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1622", "title": "Coral Edge TPU Activation Memory Sizing", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum safe batch size given 8 MB SRAM, 4.8 MB weights, 0.2 MB driver reserve, and 750 KB activations per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1623", "title": "Calculate Precision Throughput Delta on Cloud AI 100", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the absolute difference in throughput (FPS) when quantizing the model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1624", "title": "CI/CD Performance Gating for Qualcomm Cloud AI 100", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What IPS threshold should the CI/CD gate use for the 500 GOPS/inference INT8 model at 75% Cloud AI 100 utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1625", "title": "CPU Fallback Bottleneck on Edge TPU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical maximum FPS results when 20% of the 10 GOPS model falls back to a 100 GOPS CPU and execution is sequential?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1626", "title": "Jetson Orin LLM Memory Footprint", "topic": "model-size-estimation", "competency_area": 
"architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much memory is required for the 15B INT8 weights, and how much remains for KV cache and activations after the 8 GB reserve on a 32 GB system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1627", "title": "Hailo-8 Host Memory Bandwidth Alerting", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What continuous PCIe bandwidth is needed for 60 FPS 1080p RGB input streaming, and what baseline utilization is that of the 500 MB/s limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1628", "title": "Hardware-Aware NAS Compute Constraint Calculation", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the absolute maximum number of operations (in GOps) a candidate architecture can have per frame to meet the 10,000 FPS constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1629", "title": "Heterogeneous Parallel Branch Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the theoretical compute latencies if Branch A and Branch B run sequentially versus concurrently on the DLA and GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1630", "title": "Compute-Bound Latency on Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected execution latency in milliseconds for the 200 GOPs projection layer at 50% of 400 TOPS INT8 peak?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1631", "title": "Calculating Streaming Bandwidth for Pruned Hailo-8 Models", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What host memory bandwidth is required to stream weights after uniform 50% structured channel pruning at 50 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1632", "title": "Watchdog Timeout Calculation Under Contention", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", 
"bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum allowable base GPU inference time so the 50ms watchdog is never tripped under contention and jitter?", "chain_ids": ["edge-chain-auto-secondary-008-02"], "chain_positions": {"edge-chain-auto-secondary-008-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1633", "title": "Calculate Edge TPU Ingestion Throughput for Video Stream", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming the accelerator operates at 4 TOPS (INT8) and has a 2W power envelope, what maximum theoretical FPS can it process, and will USB 3.0 bottleneck the raw 1080p RGB 30 FPS stream?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1634", "title": "Dataflow Latency and Energy on AI 100", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the expected execution latency in milliseconds and the energy consumed in millijoules for this single layer?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 0}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1635", "title": "Orin AGX Thermal-Limited FPS Estimate", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum sustained FPS can the Jetson Orin run indefinitely when limited to 30W and each inference costs 2 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1636", "title": "USB Latency Overhead in Edge TPU Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total latency per frame if the 4 cameras are processed sequentially (batch size 1) versus batched together over USB 2.0?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1637", "title": "Host Round-Trip Latency in Edge TPU Operator Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much USB communication time per inference is saved by fusing the 4 MB fallback activation so it stays on the Edge TPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1638", 
"title": "Calculate Maximum Context Length on Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Given the model has 32 layers, 32 KV heads, and a head dimension of 128, what is the maximum batch-1 context length (in tokens) assuming an FP16 KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1639", "title": "Shadow Deployment Arbitration on Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design an on-device shadow deployment architecture that safely runs both models concurrently without violating constraints?", "chain_ids": ["edge-chain-auto-secondary-011-02"], "chain_positions": {"edge-chain-auto-secondary-011-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1640", "title": "On-Device LoRA with Heterogeneous Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect LoRA fine-tuning on Orin to fit the 8B INT8 transformer, 45 GB naive activations, 32 GB RAM, and 60W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1641", "title": "Dataflow Adversarial Defense on Hailo-8 Edge Cameras", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What Hailo-8-friendly adversarial patch defense would maintain 30 FPS without creating host-memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1642", "title": "Dataflow-Aware Dynamic Batching for Multi-Camera Hailo-8 Edge Appliances", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you dynamically batch 4 asynchronous 1080p 30 FPS camera streams on a DRAM-less Hailo-8 while meeting the 33 ms target?", "chain_ids": ["edge-chain-auto-secondary-005-10"], "chain_positions": {"edge-chain-auto-secondary-005-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1643", "title": "Heterogeneous CNN Design for Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What CNN architecture would you design to map 4K 60 FPS perception across Orin's GPU and DLA while maximizing INT8 throughput within 60W?", "chain_ids": 
["edge-chain-auto-secondary-002-08"], "chain_positions": {"edge-chain-auto-secondary-002-08": 4}, "chain_tiers": {"edge-chain-auto-secondary-002-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1644", "title": "Cloud AI 100 8B LLM Serving at 20 RPS Under 1 Second", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you serve the router, retriever, and 8B LLM on one Cloud AI 100 to handle 20 RPS within 1 second, and what bottlenecks dominate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1645", "title": "Multi-Model Dataflow Compute and PCIe Bandwidth Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you fuse the detector, depth estimator, and segmenter on Hailo-8, and what host bandwidth and compute utilization would result?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 4}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1646", "title": "On-Premise Coreset Selection Pipeline", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a concurrent coreset selection pipeline that prunes 1,000 FPS video to 10 FPS for cloud retraining without breaking real-time SLAs?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1647", "title": "Quantization-Aware Active Learning Pipeline for Edge TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you curate and annotate data to fix Edge TPU INT8 quantization failures on rare defects, and what mining throughput is required?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1648", "title": "On-Device Distribution Drift Detection for Edge TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you detect statistical data drift on-device using the Coral Edge TPU without streaming raw images or disrupting INT8 inference?", "chain_ids": ["edge-chain-auto-secondary-009-27"], "chain_positions": 
{"edge-chain-auto-secondary-009-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1649", "title": "Bypassing M.2 Edge TPU PCIe Bottlenecks", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What memory and data-movement architecture would make 4 HD 30 FPS camera streams real-time on a PCIe Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1650", "title": "Dataflow Optimization for Sequenced Output", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which sequence model architecture and host-device streaming strategy would maximize throughput on the DRAM-less dataflow accelerator?", "chain_ids": ["edge-chain-auto-secondary-011-26"], "chain_positions": {"edge-chain-auto-secondary-011-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1651", "title": "Dataflow Optimization for Hailo-8 Stream Processing", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design an object detection and tracking pipeline on Hailo-8 to minimize energy per inference by reducing host memory access?", "chain_ids": ["edge-chain-auto-secondary-005-11"], "chain_positions": {"edge-chain-auto-secondary-005-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-005-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1652", "title": "Hailo-8 Supported INT8 Quantization for Transformers", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you cut Hailo-8 transformer bandwidth and recover accuracy within supported INT8 quantization flows?", "chain_ids": ["edge-chain-auto-secondary-006-02"], "chain_positions": {"edge-chain-auto-secondary-006-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1653", "title": "On-Device Intersectional Fairness Auditing at the Edge", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a continuous, on-device fairness audit for 16 subgroups without exceeding the 60W TDP or disrupting the primary 30 FPS inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1654", "title": "Hybrid On-Device FL using Hailo-8 Dataflow 
Accelerator", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you use the INT8 inference-only accelerator to run local federated training without stalling the host CPU?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 2}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1655", "title": "Heterogeneous Pipeline Design for Jetson Orin", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition a massive transformer perception model across the Ampere GPU and DLA, and design kernels to avoid LPDDR5 bottlenecks?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1656", "title": "Heterogeneous Graph Compilation on Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What graph partitioning and operator lowering strategy would you use to hit 120 FPS within the Jetson Orin's 60W TDP?", "chain_ids": ["edge-chain-auto-secondary-006-26"], "chain_positions": {"edge-chain-auto-secondary-006-26": 4}, "chain_tiers": {"edge-chain-auto-secondary-006-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1657", "title": "Cross-Architecture Distillation for Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you distill the ViT into an Edge TPU-compatible INT8 CNN that runs fully on the TPU while preserving accuracy?", "chain_ids": ["edge-chain-auto-secondary-008-19"], "chain_positions": {"edge-chain-auto-secondary-008-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1658", "title": "Edge TPU Cluster Load Balancing for Real-Time Video Analytics", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you route and load-balance 8 camera streams across 4 Coral Edge TPUs while handling bursts and frame drops?", "chain_ids": ["edge-chain-auto-secondary-006-07"], "chain_positions": {"edge-chain-auto-secondary-006-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1659", "title": 
"Dataflow Streaming on Host-Dependent Accelerators", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the Hailo-8 memory hierarchy and streaming strategy for 4 1080p30 streams without stalling PCIe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1660", "title": "Cached Model Switching for Edge Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve 15 unpredictable 4 GB models across 8 parallel workers without OOMs or long cold starts?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1661", "title": "Zero-Copy Host Streaming Architecture for Hailo-8 Memory", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign host memory management to stop sporadic OOM kills while preserving zero-copy Hailo-8 streaming?", "chain_ids": ["edge-chain-auto-secondary-011-08"], "chain_positions": {"edge-chain-auto-secondary-011-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1662", "title": "Edge TPU QAT and Mixed-Precision Pipeline Design", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a BF16/FP32 QAT pipeline that simulates strict INT8 Edge TPU execution and avoids accuracy loss?", "chain_ids": ["edge-chain-auto-secondary-011-28"], "chain_positions": {"edge-chain-auto-secondary-011-28": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1663", "title": "Hailo-8 Dataflow Compilation with Unsupported Ops", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you convert and deploy the unsupported activation and attention operators without breaking Hailo-8 dataflow efficiency?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1664", "title": "Multi-Tenant LLM Serving on Cloud AI 100", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you serve the 8B chat model and 14B summarization model concurrently on one Cloud AI 100 while 
meeting chat SLAs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1665", "title": "LLM Inference Sizing for Cloud AI 100", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What quantization and memory-management plan makes a 14B LLM with 2048-token contexts and 16 users feasible on one 32 GB Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-003-30"], "chain_positions": {"edge-chain-auto-secondary-003-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1666", "title": "Autonomous Fleet Telemetry on Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design telemetry for Jetson Orin robots that can buffer 12 hours offline without starving the perception stack or memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1667", "title": "Hardware-Aware NAS for Edge TPU Object Detection", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS framework to discover the optimal network topology for this specific accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1668", "title": "Optimal LLM Operator Scheduling on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule a 15B INT8 LLM on Cloud AI 100 to reuse weights and hide KV-cache traffic within 32 GB LPDDR4x?", "chain_ids": ["edge-chain-auto-secondary-008-29"], "chain_positions": {"edge-chain-auto-secondary-008-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1669", "title": "Dynamic Thermal Throttling on Jetson Orin", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition workloads and use DVFS P-states to keep the perception system under a 40W sustained power cap?", "chain_ids": ["edge-chain-auto-secondary-002-11"], "chain_positions": {"edge-chain-auto-secondary-002-11": 4}, "chain_tiers": {"edge-chain-auto-secondary-002-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1670", "title": "Edge TPU Pipeline Profiling and Bottleneck Resolution", "topic": "profiling-bottleneck-analysis", 
"competency_area": "latency", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you eliminate the Coral Edge TPU CPU fallback and redesign the pipeline to meet the 33ms frame target?", "chain_ids": ["edge-chain-auto-secondary-010-02"], "chain_positions": {"edge-chain-auto-secondary-010-02": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1671", "title": "Mixed-Precision LLM Architecture for a 32 GB INT8 Edge Accelerator", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What mixed-precision quantization architecture would fit a 35B MoE LLM into 32 GB while using the accelerator's INT8 engines?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 3}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1672", "title": "On-Premise Guardrail Architecture for High-Throughput Edge Inference", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you co-locate a 14B summarization LLM with PII and toxicity guardrails under a 150ms on-prem latency SLA?", "chain_ids": ["edge-chain-auto-secondary-010-05"], "chain_positions": {"edge-chain-auto-secondary-010-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1673", "title": "Architecting a Streaming Pipeline for Dataflow Roofline Optimization", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the Hailo-8 host streaming path, analyze the roofline, and restructure YOLOX for 4-camera 1080p60 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1674", "title": "Architecting ISO-26262 Compliant Vision on Coral Edge TPU", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an architecture that provides deterministic execution, continuous self-testing, and fail-safe handling without violating the 50ms ASIL-B latency?", "chain_ids": ["edge-chain-auto-secondary-008-01"], "chain_positions": {"edge-chain-auto-secondary-008-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1675", "title": "Multi-modal Sensor Data Ingestion Architecture on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": 
"L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a novel hybrid storage format and tiering strategy to serialize and persist this heterogeneous data without stalling the inference pipeline or exceeding 60W?", "chain_ids": ["edge-chain-auto-secondary-008-31"], "chain_positions": {"edge-chain-auto-secondary-008-31": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1676", "title": "Edge TPU Systolic Dataflow for Depthwise Convolutions", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign dataflow, tiling, and operators so the drone tracker fully uses the TPU without CPU fallbacks?", "chain_ids": ["edge-chain-auto-024-08"], "chain_positions": {"edge-chain-auto-024-08": 3}, "chain_tiers": {"edge-chain-auto-024-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1677", "title": "Architecting Thermal Resilient Video Analytics", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you schedule 30 HD camera streams across two Cloud AI 100 cards to meet latency SLAs under 45°C thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-006-17"], "chain_positions": {"edge-chain-auto-secondary-006-17": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1678", "title": "Hailo-8 Streaming KV-Cache Architecture for Long-Context Transformers", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you redesign attention dataflow for a 1B VLM on Hailo-8 to avoid streaming a 4K-token KV cache every generation step?", "chain_ids": ["edge-chain-auto-017-11"], "chain_positions": {"edge-chain-auto-017-11": 2}, "chain_tiers": {"edge-chain-auto-017-11": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1679", "title": "Unified Memory Multimodal Perceptor", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you partition Jetson Orin's 32 GB unified memory for a 13B LLM, 2B vision encoder, and 4000-token history without OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1680", "title": "Zero-Copy Multi-Camera Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you build a zero-copy ISP-to-GPU/DLA pipeline for 8 concurrent 4K30 streams on Jetson Orin without starving inference?", 
"chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 5}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1681", "title": "CI/CD Pipeline for Fleet-Wide Coral Edge TPU Deployment", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a CI/CD pipeline for 10,000 edge cameras to quantize, verify operator mapping, and safely roll out models?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 3}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1682", "title": "Dual-Bank OTA Architecture for Coral Edge TPU Ensembles", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a zero-downtime A/B OTA scheme for 5,000 Coral devices that atomically updates the app and all three models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1683", "title": "Canary Rollout Context Thrashing on AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the structural bottleneck of this traffic splitting approach on the Cloud AI 100, and how would you quantify a fix?", "chain_ids": ["edge-chain-auto-secondary-011-03"], "chain_positions": {"edge-chain-auto-secondary-011-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1684", "title": "On-Device LoRA Gradient Checkpointing Tradeoffs", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantify activation checkpointing so this LoRA fine-tuning job fits in 32 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1685", "title": "Optimizing Adversarial Purification on Jetson Orin", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the hardware bottleneck and quantify the performance gain of a deployment optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1686", "title": "Dynamic Batching Latency Optimization on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "What causes the 80ms tail latency from dynamic batching, and what batching strategy would meet the 33.3ms deadline?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1687", "title": "Optimizing CNNs for Qualcomm Cloud AI 100 Bottlenecks", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you redesign the architecture to alleviate this memory bandwidth bottleneck, and how do you quantify the reduction in memory traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1688", "title": "Edge TPU Multi-Model Pipeline Fallback Optimization", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the Coral Edge TPU OCR bottleneck to ensure the detector-plus-OCR pipeline reaches 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1689", "title": "Optimizing Transformer Inference on Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would INT8 quantization and DLA utilization change the compute, memory, and power profile of the 2.5B tracking transformer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1690", "title": "Edge TPU INT8 Calibration Coreset Optimization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you systematically optimize the calibration data selection to diagnose and fix this quantization bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1691", "title": "Optimizing Data Validation Pipelines for Coral Edge TPU", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you move data validation and anomaly detection from the host CPU to the Coral Edge TPU and quantify the throughput gain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1692", "title": "Optimizing Edge Active Learning Curation", "topic": "dataset-curation", "competency_area": "data", "track": 
"edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you reduce Hailo-8 active learning storage and host bandwidth by selecting only semantically novel frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1693", "title": "Optimizing Drift Detection on Hailo-8 Dataflow Accelerator", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose this bottleneck and optimize the drift detection strategy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1694", "title": "Hailo-8 Zero-Copy Stream Optimization", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What host-device data movement bottleneck stalls the pipeline, and how would zero-copy DMA fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1695", "title": "Optimizing Encoder-Decoder Latency on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the compute and memory bottlenecks for the decoder, and what optimizations would you apply to resolve this bottleneck?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1696", "title": "Optimizing Intersectional Fairness Evaluation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze this bottleneck and optimize the pipeline to ensure fairness metrics are calculated efficiently without exceeding the power budget or memory limits?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1697", "title": "Optimizing INT8 Compute Utilization on Qualcomm AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would a roofline analysis quantify the 5% utilization bottleneck and guide fusion, coalescing, and batching?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-1698", "title": "Thermal Throttling Degradation on Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What degradation ladder would you use after a 40% GPU clock throttle to keep the perception pipeline at a safe 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1699", "title": "Operator Fusion Bottlenecks on Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this compiler bottleneck and quantify the impact of operator lowering and fusion?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1700", "title": "Feature Distillation I/O Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the bottleneck caused by the 16MB intermediate activation on Hailo-8, and what distillation architecture change would remove it?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1701", "title": "Google Coral USB Latency Pipeline Optimization", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where is the hidden latency in the 85ms Coral Edge TPU pipeline, and how would you optimize it to meet the 33.3ms frame budget?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1702", "title": "Stream Routing and PCIe Bottleneck Analysis", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does round-robin routing stall the 16-camera system, and how much would stream-affinity routing improve throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1703", "title": "Optimizing Cold Start with Memory-Mapped Inference on Coral TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": 
"optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would memory-mapped loading reduce cold-start latency when multiple processes load the same 20MB Coral Edge TPU model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1704", "title": "Resolving Concurrent OOM on Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the memory bottleneck and quantify a structural fix to prevent OOM without exceeding the 60W TDP limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1705", "title": "Dataflow Pipeline Bottlenecks in Mixed-Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the system-level cost of host FP16 fallback on Hailo-8, and how would you keep the whole model on the INT8 dataflow path?", "chain_ids": ["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1706", "title": "TensorRT DLA to GPU Fallback Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the operator coverage gap causing this fallback and quantify the performance cost?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1707", "title": "Edge TPU Model Quantization and Footprint Optimization", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose Edge TPU CPU fallbacks in the MobileNetV2 graph and verify the fully INT8 model fits and runs efficiently?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1708", "title": "Optimizing Telemetry Overhead on Qualcomm Cloud AI 100", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze this observability overhead and design an optimized telemetry strategy without losing visibility into stragglers?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1709", "title": "Operator Scheduling and Fusion on Edge TPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What scheduling bottleneck is caused by the unsupported custom operator, and how much latency is saved by making the graph fully TPU-compatible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1710", "title": "Optimizing Hailo-8 OTA Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you prevent a 50MB OTA update over 5 Mbps from starving Hailo-8 inference on the shared host memory bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1711", "title": "Hailo-8 Host Streaming Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix the host-side bottleneck keeping the 1080p60 pipeline at 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1712", "title": "Optimizing Privacy Guardrails on Edge TPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the Coral privacy model's 80ms latency spike, and how would you refactor it to be fully TPU-compatible?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1713", "title": "Storage I/O Optimization for Qualcomm AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What data-loading bottleneck is leaving the Cloud AI 100 at 10% utilization, and what storage format would keep it fed?", "chain_ids": ["edge-chain-auto-secondary-008-34"], "chain_positions": {"edge-chain-auto-secondary-008-34": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1714", "title": "Optimizing High-Res Camera Ingestion on Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", 
"phase": "both", "question": "What ingestion bottleneck is dropping six 4K camera streams on Jetson Orin, and how would zero-copy NVMM buffers reduce it?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1715", "title": "Hailo-8 Spatial Tiling and Host Bandwidth", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you analyze the host memory bottleneck and quantify the impact of a depth-first dataflow tiling strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1716", "title": "Mitigating Edge TPU Thermal Throttling in Sealed Enclosures", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal bottleneck is causing the device to drop from 30 to 15 FPS, and how would you sustain at least 20 FPS?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1717", "title": "Optimizing KV Cache for Long-Context on Orin", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Diagnose the architectural bottleneck causing this degradation and quantify the optimization impact of using Grouped-Query Attention (GQA) and INT8 KV-cache quantization?", "chain_ids": ["edge-chain-auto-017-10"], "chain_positions": {"edge-chain-auto-017-10": 1}, "chain_tiers": {"edge-chain-auto-017-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1718", "title": "Google Coral Edge TPU Subgraph Optimization", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the Coral Edge TPU model reaching only 10 FPS after quantization, and how would you eliminate CPU fallbacks to hit 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1719", "title": "Mitigating Bandwidth Bottlenecks for 4-bit AWQ on Jetson Orin", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the 4-bit AWQ 7B model slow during decoding on Jetson Orin, and what fused-kernel fix would remove the bottleneck?", "chain_ids": ["edge-chain-auto-secondary-006-01"], "chain_positions": {"edge-chain-auto-secondary-006-01": 1}, "chain_tiers": 
{"edge-chain-auto-secondary-006-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1720", "title": "Mitigating Federated Communication Bottlenecks on Jetson Orin", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is stalling federated averaging on Jetson Orin, and how would INT8 plus top-k update compression reduce LTE time and power?", "chain_ids": ["edge-chain-auto-017-01"], "chain_positions": {"edge-chain-auto-017-01": 1}, "chain_tiers": {"edge-chain-auto-017-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1721", "title": "Optimizing ViT Activation Memory Bandwidth on Jetson Orin", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you diagnose the exact memory hierarchy constraint and quantify the impact of operator fusion to hit the framerate target within the 60W TDP?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 3}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1722", "title": "Optimizing CI/CD Deployment Pipelines for Hailo-8 Dataflow Architectures", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What deployment bottleneck can drop Hailo-8 throughput despite unchanged INT8 ops, and how should CI/CD configure host streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1723", "title": "Edge TPU Multi-Model Cache Thrashing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the Coral cascade's 80% idle time and fix SRAM weight thrashing between the 5.5MB and 4.5MB models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1724", "title": "Canary Rollout of INT8 Model on Coral Edge TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you structure the rollout, and what specific telemetry do you monitor to decide whether to advance or rollback the deployment?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1725", "title": "Dataflow Accelerator Viability for 
Multi-Camera Edge Pipeline", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the dataflow accelerator sustain four 1080p30 streams at 50 GOPS per frame, and what system-level constraints drive the decision?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1726", "title": "Edge TPU Activation Memory Constraint Evaluation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "When Edge TPU activations exceed SRAM, how would you choose among downsampling, CPU/TPU splitting, and receptive-field changes?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1727", "title": "Adversarial Input Purification on Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is an INT8 input purification network feasible on the Cloud AI 100, and what compute and memory overhead would it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1728", "title": "Sizing Dynamic Batching for LLM Prefill on Edge", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is the Cloud AI 100 compute-bound or memory-bound during prefill, and what dynamic-batching limit meets the 200ms TTFT SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1729", "title": "Sizing MobileNetV2 for Google Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose the MobileNetV2 input resolution and depth multiplier to achieve 30 FPS within Coral Edge TPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1730", "title": "Multi-Model Pipelining on Hailo-8 Dataflow Architecture", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the detector and OCR networks be swapped sequentially or co-compiled on Hailo-8 to meet 30 FPS, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1731", "title": "Scaling Video Analytics on Qualcomm Cloud AI 100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", 
"zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many accelerators are needed for 500 streams at 150 GOPS and 30 FPS, and what is the peak power draw?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1732", "title": "Data Pruning for Hailo-8 Streaming Limits", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should Hailo-8 bandwidth limits shape data pruning for a compact 30 FPS detector?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1733", "title": "Optimizing 4K Image Pipelines for Cloud AI 100", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you partition decoding, resizing, and normalization between the host and accelerator to keep 4K 120 FPS inference from starving?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 3}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1734", "title": "Streaming Data Validation for Hailo-8 Edge Inference", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the architectural tradeoffs between performing lightweight data quality checks on the host CPU versus running a small validation model on the Hailo-8 for 4x 1080p 30 FPS streams?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 3}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1735", "title": "On-Device Active Learning Data Selection", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you size and place uncertainty sampling on Jetson Orin while respecting memory, DLA limits, and a 5 Mbps uplink?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1736", "title": "Resource-Constrained Drift Detection on Jetson Orin", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you implement real-time data drift detection on Jetson Orin without disrupting the main GPU 
or DLA inference within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1737", "title": "Zero-Copy Pipeline Design on Unified Edge Architectures", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you redesign the four-4K-camera Jetson Orin pipeline to use zero-copy buffers and eliminate CPU, GPU, and DLA transfer overhead?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1738", "title": "Encoder-Decoder Sizing on Qualcomm AI 100", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which translation model would you deploy on Cloud AI 100, the 7B decoder-only or 3B encoder-decoder, and how would you batch it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1739", "title": "Energy-Aware INT8 Throughput Under a 75W LPDDR4x Budget", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design the execution strategy to maximize INT8 throughput under 75W while minimizing LPDDR4x access energy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1740", "title": "Sizing a 70B LLM for Qualcomm Cloud AI 100", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What extreme quantization scheme would fit a 70B LLM and KV cache into 32 GB on the Cloud AI 100, and what are the runtime tradeoffs?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1741", "title": "Smart Doorbell INT8 Fairness Evaluation", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate intersectional fairness for the INT8 face detector under the 2W deployment constraints?", "chain_ids": ["edge-chain-auto-secondary-009-28"], "chain_positions": {"edge-chain-auto-secondary-009-28": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1742", "title": "Evaluating GPU to Edge TPU Migration", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", 
"level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you migrate the 20 GOPS, 50 FPS detector from an FP16 GPU to the Coral Edge TPU and verify it fits the 4 TOPS INT8 limit?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1743", "title": "Dual-Model QoS Shedding on a 32 GB Edge Accelerator", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a graceful degradation ladder on a 32 GB accelerator to preserve premium quality while serving free-tier fallback traffic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1744", "title": "Fused Attention Optimization on Qualcomm Cloud AI 100", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What fused attention strategy would you use on Cloud AI 100 to remove LPDDR4x round-trips, and how much bandwidth would it save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1745", "title": "Jetson Orin DLA-GPU CNN-Transformer Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design feature distillation and size the student CNN-Transformer to hit 30 FPS while using the Orin's Ampere GPU and DLA efficiently?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1746", "title": "Heterogeneous Routing on NVIDIA Jetson Orin", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you route 24 HD video streams across Jetson Orin's GPU and DLAs to maximize throughput while staying within 60W?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1747", "title": "Hailo-8 Multi-Process Memory-Mapped Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you share the 120 MB Hailo-8 backbone across three containers while staying under a 200 MB host memory budget?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1748", "title": "Sizing Paged Memory for Multi-Tenant LLM Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design continuous batching for a 13B INT8 model on 32 GB Cloud AI 100 to avoid KV-cache fragmentation and OOMs?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1749", "title": "Mixed-Precision Perception on Jetson Orin", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a mixed-precision Jetson Orin deployment for the 5B multimodal model to meet 4-camera 30 FPS under 60W?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": {"edge-chain-auto-secondary-011-30": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1750", "title": "Hardware-Aware Shadow Deployment on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is shadow-deploying the INT8 candidate on Jetson Orin feasible, and how would you partition production and candidate workloads?", "chain_ids": ["edge-chain-auto-026-11"], "chain_positions": {"edge-chain-auto-026-11": 2}, "chain_tiers": {"edge-chain-auto-026-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1751", "title": "Optimizing ViT Operator Fallback on Cloud AI 100", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Would you rewrite the ViT to use Cloud AI 100 supported operators or write custom kernels for unsupported attention ops, and why?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1752", "title": "Multi-Model Serving Strategy on Hailo-8 Accelerator", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Hailo-8 detector and feature extractor be temporally multiplexed or co-compiled into one spatial context, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1753", "title": "Sizing Object Detection for Hailo-8 Dataflow Limits", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can one Hailo-8 run the 25M-parameter, 50 GOps model for four 1080p 30 FPS feeds, and what is the primary bottleneck?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 3}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1754", "title": "Sizing Telemetry for Coral Edge TPU Fleet", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What telemetry architecture, metrics, cadence, and central ingestion sizing would you use for 5,000 nodes over 50 Kbps links?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1755", "title": "Hailo-8 Dataflow Scheduling for High-Resolution Detection", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule and fuse the YOLOv8 layers to maintain 30 FPS without making PCIe bandwidth the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1756", "title": "Camera Stream Sizing for Cloud AI 100", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many 30 FPS streams can the Cloud AI 100 support for a 2.5 TOPS-per-frame model under the 60W chassis power cap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1757", "title": "Privacy Guardrail Pipeline on Dataflow Edge", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do you design this dual-model pipeline and size the required host-accelerator memory bandwidth given the hardware constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1758", "title": "Edge TPU Storage Format and I/O Throughput Sizing", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What storage format should the Coral pipeline use for hard negatives on eMMC, and how would you size I/O to keep the Edge TPU fed?", "chain_ids": 
["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1759", "title": "Multi-Camera Streaming Edge Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the system handle 20 uncompressed 1080p60 camera streams for a 15 GOPS/frame feature extractor, and where is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1760", "title": "Optimizing ViT Attention Dataflow on Systolic Edge NPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should QxK^T use weight-stationary or output-stationary dataflow, and how would you tile it to minimize memory access on a systolic edge NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1761", "title": "Edge TPU Activation and Weight Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you size the CNN weights, activations, and intermediate tensors so the Coral Edge TPU keeps the detector in one INT8 subgraph?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1762", "title": "Optimizing Object Detection Graph for Coral Edge TPU", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the tradeoff between modifying the graph to use natively supported INT8 operators versus pipelining the host-side execution. 
Should you modify the SSD graph or pipeline the fallback?", "chain_ids": ["edge-chain-auto-secondary-006-28"], "chain_positions": {"edge-chain-auto-secondary-006-28": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1763", "title": "Hailo-8 End-to-End Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you break down the 15 ms latency across preprocessing, PCIe transfers, accelerator compute, and postprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1764", "title": "Hardware-Aware NAS for Jetson Orin DLA", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you constrain the NAS search space and cost model so the Orin DLA-only perception model is compatible and meets latency targets?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1765", "title": "Pruning Tradeoffs for Dataflow Edge Accelerators", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "For the 4-camera Hailo-8 detector, would you use unstructured pruning or structured channel pruning to fit PCIe and 2.5W limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1766", "title": "Hailo-8 INT8 Quantization Strategy for Dataflow Streaming", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the system-level impact of choosing per-channel versus per-tensor INT8 quantization on the streaming architecture's throughput?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 3}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1767", "title": "Roofline Optimization on Qualcomm Cloud AI 100", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using roofline analysis on Cloud AI 100, is the 150 GOPS ViT memory-bound or compute-bound, and should you prioritize pruning or batching?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 4}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1768", "title": "Sizing Llama-3 8B KV Cache for AI 100", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you calculate the maximum batch size and context length for an 8B INT8 LLM on 32 GB Cloud AI 100 and improve utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1769", "title": "Shadow Deployment Quantization Requirement on Coral TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "To successfully compile the new candidate model and deploy it to receive duplicated live traffic, what specific numerical precision format must the model be fully converted to?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1770", "title": "Google Coral Edge TPU Activation Data Type Requirement", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What data format must all activations and weights use before compiling the FP32-trained model for the 2W Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-003-20"], "chain_positions": {"edge-chain-auto-secondary-003-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1771", "title": "Identifying Power Side-Channel Attacks on Edge Accelerators", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a power side-channel attack, and what physical property of the Cloud AI 100 would an attacker monitor?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1772", "title": "Hailo-8 Host-Side Data Stream Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "In this architecture, where must data quality checks and schema validation occur, and what happens if corrupted data is streamed to the device?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1773", "title": "Recall KL Divergence for Edge 
Drift", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the mathematical definition of Kullback-Leibler (KL) divergence, and is it a symmetric metric?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1774", "title": "Define Fail-Operational vs Fail-Safe on Cloud AI 100", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the exact difference between fail-safe and fail-operational states in this Cloud AI 100 edge deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1775", "title": "Jetson Orin DLA Offloading for Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific name of the dedicated fixed-function hardware accelerator on the Jetson Orin designed for this type of inference?", "chain_ids": ["edge-chain-auto-secondary-008-22"], "chain_positions": {"edge-chain-auto-secondary-008-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1776", "title": "Assigning Jetson Orin Model Instances to GPU and DLA", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which NVIDIA configuration lets you place inference model instances on either the Jetson Orin GPU or DLA?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1777", "title": "Hailo-8 Memory Architecture and Power Efficiency", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental memory architecture of the Hailo-8 dataflow accelerator, and what is its rated INT8 performance and power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1778", "title": "Jetson Orin Peak INT8 Performance Recall", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "To set expectations for the inference latency of your quantized model, what is the theoretical peak INT8 compute performance of the NVIDIA Jetson Orin, and what hardware components contribute to this?", "chain_ids": ["edge-chain-auto-secondary-011-30"], "chain_positions": {"edge-chain-auto-secondary-011-30": 0}, 
"chain_tiers": {"edge-chain-auto-secondary-011-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1779", "title": "Qualcomm Cloud AI 100 Compute and Memory Recall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum INT8 compute throughput and total memory capacity of this specific accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1780", "title": "Hailo-8 Local Memory Architecture Recall", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the capacity of the onboard local DRAM on a Hailo-8 chip available for storing model weights and activations?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1781", "title": "Recall NVIDIA Jetson Orin DLA Purpose", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does DLA stand for, and what type of operations is it primarily optimized for in this architecture?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1782", "title": "Hailo-8 Dataflow Memory Constraints for Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific hardware memory characteristic of the dataflow accelerator dictates this spatial operator scheduling approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1783", "title": "Jetson Orin DLA Identification for Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific, separate hardware accelerator on the Jetson Orin is likely processing the INT8 inference workload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1784", "title": "Sparsity Support on Google Coral Edge TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Does 75% unstructured weight pruning reduce inference latency on the Edge TPU, and 
what pruning approach would actually help?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 0}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1785", "title": "Hailo-8 Hardware Specs for Model Cards", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the peak compute throughput, required precision, and power consumption of the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1786", "title": "Edge Inference Logging Storage Format", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What binary, row-oriented storage format native to the TensorFlow ecosystem is designed for efficiently logging sequential, append-only inference results to local storage?", "chain_ids": ["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1787", "title": "Qualcomm Cloud AI 100 Memory Capacity Recall", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the exact capacity and memory type of the Qualcomm Cloud AI 100's on-board memory for sizing ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1788", "title": "NVIDIA DLA Primary Compute Architecture on Jetson Orin", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific type of hardware architecture does the DLA use at its core to compute dense matrix multiplications?", "chain_ids": ["edge-chain-auto-024-07"], "chain_positions": {"edge-chain-auto-024-07": 0}, "chain_tiers": {"edge-chain-auto-024-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1789", "title": "Hailo-8 Power and Performance Specification Recall", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the typical power consumption and peak INT8 performance of the Hailo-8 under load?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1790", "title": "Google Coral Edge TPU Precision Requirement", "topic": 
"latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What quantization requirement must the model meet to run on the Coral Edge TPU instead of falling back to the CPU?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1791", "title": "Qualcomm Cloud AI 100 Physical Memory Capacity Recall", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total memory capacity and specific memory technology used on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1792", "title": "Qualcomm Cloud AI 100 Toolchain and Specs Recall", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of the primary software SDK and compiler toolchain provided by Qualcomm used to convert and optimize standard ONNX models for this specific hardware?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1793", "title": "Google Coral Edge TPU Supported Data Type", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What specific data type must all model weights and activations be quantized to for successful execution on the Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1794", "title": "Shadow Deployment Design on Qualcomm Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you shadow the new model on live 60 FPS streams while preserving primary latency and fitting within edge resource constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1795", "title": "On-Premise LLM Fine-Tuning Memory Specification", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": 
"specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What activation memory strategy would let the 7B fine-tuning run within 32 GB by trading Cloud AI 100 compute for memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1796", "title": "Adversarial Patch Defense Spec on Jetson Orin", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an adversarial-patch defense for 4K 30 FPS on Jetson Orin while sharing the GPU, DLA, memory, and 275 TOPS budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1797", "title": "Continuous Batching for Multi-Camera Object Detection", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you batch 8 asynchronous 30 FPS camera streams to maximize throughput while keeping per-frame latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1798", "title": "Multi-Stream Video Analytics on Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you modify EfficientNet-lite and batching on the Cloud AI 100 to maximize 30 FPS stream density without becoming memory-bandwidth bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1799", "title": "Dual-Model Security Pipeline on Google Coral Edge TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you orchestrate the detector, embedding model, INT8 quantization, and host-device routing to meet 15 FPS on a 4 TOPS INT8, 2W accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1800", "title": "Jetson Orin Fleet Compute Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can one Jetson Orin support two 150 TOPS video streams, and what is the daily electricity cost for 1,000 devices at 60W and $0.15/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1801", "title": "Edge TPU Visual Inspection Data Quality Pipeline", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an edge-only data validation gate for noisy Coral Edge TPU camera frames 
within the 2W INT8 constraints?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1802", "title": "Data Curation for Hailo-8 INT8 Quantization", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate an INT8 calibration dataset for Hailo-8 so activation outliers are captured and quantization accuracy is preserved?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1803", "title": "Hailo-8 Traffic Camera Drift Detection Specification", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect seasonal data drift at the edge on Hailo-8 without adding host memory bandwidth pressure or disrupting inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1804", "title": "Hailo-8 Multi-Stream Zero-Copy DMA Pipeline Design", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build the host-to-Hailo-8 memory pipeline for four 1080p30 streams to avoid frame drops and minimize CPU copies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1805", "title": "Energy-Aware Inference Architecture on Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should the data flow and operator selection be designed to stay within the 60W power budget while maximizing compute utilization?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": {"edge-chain-auto-secondary-005-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1806", "title": "On-Premise Intersectional Fairness Specification for Diagnostic Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify the architectural requirements to track and compute these fairness metrics across various demographic subgroups while co-locating with the primary diagnostic workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1807", "title": "Inference Compute Specification for Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you balance batch size and operational intensity to maximize the INT8 compute utilization without hitting the LPDDR4x memory bandwidth wall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1808", "title": "Degradation Ladder for Autonomous Edge Perception", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you specify a fail-operational degradation ladder for Jetson Orin when thermal throttling caps power at 15W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1809", "title": "AOT Compilation Strategy for Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design AOT graph compilation and partitioning so unsupported ops do not cripple Cloud AI 100 INT8 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1810", "title": "Knowledge Distillation for Hailo-8 Dataflow", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the student model and distillation process given its INT8 dataflow execution and no local DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1811", "title": "Hailo-8 Multi-Camera Stream Load Balancing", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route 8 camera streams across 4 Hailo-8 accelerators while minimizing host memory contention and avoiding accelerator starvation?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1812", "title": "Multi-Process Inference Optimization on Google Coral Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you load one MobileNetV2 Edge TPU model across four isolated processes while minimizing RAM use and cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1813", "title": "Design Memory Management for Multi-Model Pipeline on Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", 
"status": "published", "phase": "both", "question": "How would you design a memory management specification to handle peak pressure, prevent OOMs, and minimize fragmentation for concurrent GPU/DLA inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1814", "title": "INT8 Dataflow Deployment for a DRAM-Less Accelerator", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you convert and compile the FP32 detector into a fully INT8 dataflow graph without host-memory graph breaks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1815", "title": "Jetson Orin Multi-Model TensorRT Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert and delegate the detector and segmentation models on Jetson Orin to maximize INT8 performance under 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1816", "title": "Vision Model Memory Profiling for Google Coral Edge TPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What end-to-end quantization, compilation, and memory analysis pipeline would you specify for deploying MobileNetV2-SSD on Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1817", "title": "Dataflow NAS Constraints for Hailo-8 Accelerator", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the NAS cost model and search space account for Hailo-8's no-DRAM dataflow architecture and host-memory bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1818", "title": "Edge TPU Layer Fusion and Operator Scheduling for INT8 Vision Model", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule and fuse operators on the Coral Edge TPU to keep intermediate tensors in SRAM and avoid USB CPU fallbacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1819", "title": "Hailo-8 Stream Profiling and Latency Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What comprehensive profiling architecture would you specify to determine 
whether Hailo-8 latency comes from preprocessing, PCIe bandwidth, or dataflow stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1820", "title": "INT8 Quantization Strategy for Google Coral Deployment", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fully quantize and refactor the MobileNetV2 model so it runs entirely on the Coral Edge TPU without CPU fallbacks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1821", "title": "On-Device Privacy Guardrails for Intersection Monitoring", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you integrate on-device face blurring and bias-audit logging while keeping the Coral Edge TPU pipeline fully INT8-compatible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1822", "title": "Fail-Safe Industrial Anomaly Detection on Edge TPU", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design deterministic Coral Edge TPU inference with watchdogs and safety guarantees despite limited operator support?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1823", "title": "Factory Defect Detection Storage Pipeline", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you format, compress, store, and stream 100 8MB images per second so the accelerator is not starved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1824", "title": "Real-Time Sensor Ingestion Pipeline for Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the camera and LiDAR ingestion pipeline on Jetson Orin to meet a 30ms latency budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1825", "title": "Design a Dataflow Accelerator Pipeline for ResNet-50 on Hailo-8", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which dataflow strategy would you use for ResNet-50 layers on Hailo-8 at 120 FPS, and how would you buffer weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1826", "title": "Thermal Specification for Edge TPU Burst Workloads", "topic": 
"thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What thermal management specification would prevent Coral Edge TPU throttling during 10-second 2W bursts at 45°C ambient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1827", "title": "Hailo-8 Transformer KV-Cache Streaming Architecture Specification", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition a 125M-parameter transformer on Hailo-8 given O(n²) attention and host-streamed KV-cache bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1828", "title": "Hybrid Compute Partitioning for Coral Edge TPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you partition the CNN detector and dynamic tracker between the Coral Edge TPU and host CPU under a 5W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1829", "title": "Data Pruning for Edge TPU Object Detection", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select a 500k-image coreset from 5 million images to maximize accuracy after INT8 deployment on the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1830", "title": "Designing Kernel Fusion for ViTs on Jetson Orin", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What kernel fusion strategy would you use for a ViT on Jetson Orin to reduce LayerNorm, GELU, and launch overhead within 60W?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 1}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1831", "title": "Latency Budgeting for Defect Detection on Edge TPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you allocate the 25ms camera-to-reject-arm latency budget for a Coral Edge TPU defect detection pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1832", "title": "Qualcomm Cloud AI 100 Multi-Model Serving Architecture", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": 
"specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design model serving for four ResNet-50 variants and 16 camera streams on one Qualcomm Cloud AI 100 to maximize utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1833", "title": "Observability for On-Premise AI 100 Accelerators", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you design the observability stack to ensure high reliability without overwhelming the factory's limited external network bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1834", "title": "Roofline Analysis for MobileNetV2 on Qualcomm Cloud AI 100", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use roofline analysis to diagnose if MobileNetV2 is compute-bound or memory-bound on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-018-09"], "chain_positions": {"edge-chain-auto-018-09": 3}, "chain_tiers": {"edge-chain-auto-018-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1835", "title": "Optimizing INT8 Matrix Multiplication on an Edge Accelerator", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize the underperforming INT8 GEMM on an edge accelerator for Tensor Core utilization, coalescing, and occupancy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1836", "title": "Diagnosing Inefficient Neural Network Inference on Hailo-8", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the performance bottleneck, considering the Hailo-8's unique architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1837", "title": "Optimizing INT8 Matrix Multiplication on Qualcomm Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze and optimize the INT8 GEMM kernel on Cloud AI 100 for memory coalescing, occupancy, and INT8 throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1838", "title": "Optimizing Transformer Inference on Qualcomm Cloud AI 100", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", 
"question": "How would you optimize transformer attention GEMMs on Cloud AI 100 using data layout, tiling, on-chip buffers, and INT8 units?", "chain_ids": ["edge-chain-auto-secondary-007-23"], "chain_positions": {"edge-chain-auto-secondary-007-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1839", "title": "Evaluating Qualcomm Cloud AI 100 for Edge ML Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare Cloud AI 100, Jetson Orin, and Intel Atom for smart-camera efficiency, programmability, and TCO?", "chain_ids": ["edge-chain-auto-secondary-008-14"], "chain_positions": {"edge-chain-auto-secondary-008-14": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1840", "title": "Edge AI Deployment: Selecting the Optimal Accelerator for Real-time Object Detection with Hailo-8", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the Hailo-8 against a low-power ARM CPU with integrated NPU and a small embedded GPU, and what trade-offs justify your choice?", "chain_ids": ["edge-chain-auto-secondary-008-13"], "chain_positions": {"edge-chain-auto-secondary-008-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1841", "title": "Jetson Orin: Edge Accelerator Trade-offs for Object Detection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you justify your choice, emphasizing the trade-offs across programmability, inference throughput, power budget, and memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1842", "title": "Optimizing Edge ML Inference on Google Coral Edge TPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks likely cause Coral Edge TPU latency over 50ms, and what quantifiable fixes would meet the power and performance limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1843", "title": "Quantized Conv Layer Performance on Google Coral Edge TPU", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a perfectly efficient weight-stationary systolic array, what is the theoretical minimum layer time in milliseconds?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1844", "title": "Transformer Attention Dataflow on an Edge Systolic Accelerator", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What could be the root cause of this underutilization and increased latency, specifically considering the characteristics of transformer layers and the accelerator's architecture?", "chain_ids": ["edge-chain-auto-024-09"], "chain_positions": {"edge-chain-auto-024-09": 1}, "chain_tiers": {"edge-chain-auto-024-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1845", "title": "1x1 Convolution SRAM Dataflow for Low Latency", "topic": "systolic-dataflow", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should on-chip SRAM and weight-stationary versus output-stationary dataflows minimize latency for the 1x1 convolution?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1846", "title": "Edge TPU Object Detection Cost Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the annual electricity cost for 100 Coral Edge TPU factory deployments running 24/7 with a 5W host and 75% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1847", "title": "Fleet Perception Accelerator Hours and Annual Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you estimate annual INT8 operations, accelerator-hours, and total dollar cost for 100,000 equipped vehicles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1848", "title": "Edge TPU Inference Cost and Scalability", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How many Coral Edge TPUs are required per 30 FPS camera, and what power and operational cost implications follow?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 2}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1849", "title": "Edge LLM Inference Cost on Qualcomm Cloud AI 100", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What maximum requests per second and cost per million inferences can one accelerator with 400 TOPS INT8 deliver at 80% of peak on this LLM?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1850", "title": "Edge Model Evaluation: Performance and Cost on NVIDIA Jetson Orin", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What FPS and annual electricity cost do Models Alpha and Beta achieve on Jetson Orin, and which would you recommend?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 3}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1851", "title": "Edge AI Costing: Large-Scale Vision Deployment on Hailo-8", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the per-device FPS, fleet energy use, 5-year energy cost, and hidden deployment costs for 10,000 Hailo-8 devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1852", "title": "Edge AI Compute Cost and Performance Estimation for Google Coral TPU Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the maximum theoretical inferences per second (FPS) per device, and what is the annual electricity cost for the 1000-device fleet?", "chain_ids": ["edge-chain-auto-026-15"], "chain_positions": {"edge-chain-auto-026-15": 1}, "chain_tiers": {"edge-chain-auto-026-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1853", "title": "Jetson Orin LLM Inference VRAM Budgeting Components", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory beyond the 7B BF16 weights must fit in Jetson Orin's 32 GB LPDDR5, and what formulas estimate each footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1854", "title": "VRAM Budgeting for a Quantized Vision Transformer on Hailo-8", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can this 5M-parameter INT8 vision transformer fit in the 8 MB SRAM, and how would you budget and optimize weights, activations, and KV-cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1855", "title": "VRAM Budgeting for LLM Inference on Qualcomm Cloud AI 100", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture fits the Cloud AI 100's 32 GB VRAM for 
2048-token inference, and how would AdamW fine-tuning change the budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1856", "title": "Hailo-8 SRAM Budgeting for Automotive Vision Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you budget weights, feature maps, and runtime buffers on Hailo-8 to stop OOMs and latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1857", "title": "Memory Hierarchy Tradeoffs for LLM Deployment on an Edge Accelerator", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do LPDDR4x capacity, bandwidth, latency, and the 75W power limit constrain real INT8 throughput for this large LLM?", "chain_ids": ["edge-chain-auto-024-03"], "chain_positions": {"edge-chain-auto-024-03": 1}, "chain_tiers": {"edge-chain-auto-024-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1858", "title": "Optimizing Large Language Model Deployment on NVIDIA Jetson Orin", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage the 20 GB INT8 model and activations on Jetson Orin to meet sub-100 ms per-token latency within 32 GB LPDDR5 and 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1859", "title": "Hailo-8 Memory Bandwidth for Convolution Layer", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For the 3x3 convolution, how many MACs are required, what input-activation bandwidth is needed in 1 ms with 4x reuse, and is it memory-bound versus a 68 GB/s bandwidth limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1861", "title": "Optimizing Large Language Model Deployment on NVIDIA Jetson Orin with Activation Memory Constraints", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What quantization and runtime-memory strategy lets the 10B LLM run on Jetson Orin within 32 GB LPDDR5 and 60W?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1862", "title": "Coral Edge TPU: Activation Memory & Compute-Memory Tradeoff for Real-time Segmentation", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", 
"bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which model do you recommend and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1863", "title": "Jetson Orin LLM Deployment: Activation Memory Bottleneck & Checkpointing", "topic": "activation-memory", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose Jetson Orin OOMs from forward activations in the 7B INT8 transformer and reduce peak memory without killing throughput?", "chain_ids": ["edge-chain-auto-secondary-003-21"], "chain_positions": {"edge-chain-auto-secondary-003-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1864", "title": "Hailo-8 Memory-Mapped Weight Loading Strategies for Shared Inference", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should mmap be used so multiple Hailo-8 inference processes share model weights and avoid cold-start latency?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1865", "title": "Memory-Mapped Inference on Google Coral Edge TPU: Cold Start Analysis", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes Coral cold-start latency with memory-mapped weights, and what strategy would reduce page faults and sustained storage stalls?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1866", "title": "Optimizing Memory-Mapped Model Loading on Jetson Orin", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you mmap the 20 GB LLM weights to minimize cold start while preserving enough 32 GB LPDDR5 for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1867", "title": "Optimizing Memory-Mapped Inference on Hailo-8 for Edge AI", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use mmap for 500 MB model weights to achieve fast boot or model swaps, avoid page-fault pitfalls, and share weights across processes?", "chain_ids": ["edge-chain-auto-secondary-008-24"], "chain_positions": {"edge-chain-auto-secondary-008-24": 3}, "chain_tiers": {"edge-chain-auto-secondary-008-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1868", "title": "Optimizing Memory-Mapped Inference on Google Coral Edge TPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and reduce the several-hundred-ms cold start for a 50 MB memory-mapped Coral TFLite model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1869", "title": "Optimizing Memory-Mapped Large Model Inference on Qualcomm Cloud AI 100", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design mmap-based inference for a 70B INT8 model that exceeds Cloud AI 100's 32 GB, while minimizing cold start and sharing weights?", "chain_ids": ["edge-chain-auto-secondary-008-26"], "chain_positions": {"edge-chain-auto-secondary-008-26": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1870", "title": "Optimizing Data Movement on NVIDIA Jetson Orin", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why are 1080p frame transfers slow from pageable camera buffers on Jetson Orin, and which pinned or zero-copy techniques would fix them?", "chain_ids": ["edge-chain-auto-secondary-008-15"], "chain_positions": {"edge-chain-auto-secondary-008-15": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1871", "title": "Optimize Data Movement for Edge TPU Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize host-to-Edge TPU data movement for the 512x512x3 INT8 detector when transfers consume 30% of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1872", "title": "Qualcomm Cloud AI 100 Data Movement Bottleneck Diagnosis", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose host-device data movement overhead on Cloud AI 100, and what symptoms would indicate inefficient LPDDR4x or DMA use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1873", "title": "Real-time 4K Video Object Detection Data Movement Strategy", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What data movement strategy would you use on Jetson Orin to process 4K30 
with CPU pre/postprocessing and INT8 GPU inference within 60W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1874", "title": "Edge Inference Data Movement: Coral vs. CPU/GPU", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do the integrated-GPU SoC and attached Edge TPU differ in DMA, zero-copy, and host-device data movement for high-resolution video inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1875", "title": "Qualcomm Cloud AI 100: Optimizing GenAI Data Movement", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design host-to-Cloud AI 100 tensor transfers for large generative inference to maximize throughput and minimize energy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1876", "title": "Optimizing Data Movement for Edge AI on Jetson Orin", "topic": "dma-data-movement", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and optimize the 15ms Jetson Orin data-movement bottleneck using pinned memory, async copies, or zero-copy mapping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1877", "title": "Designing for Memory Pressure on Qualcomm Cloud AI 100", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you manage memory pressure for a 28 GB LLM on a 32 GB accelerator to prevent OOMs under real-time peak load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1878", "title": "Memory-Efficient LLM Fine-tuning on NVIDIA Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you avoid OOM during Jetson Orin fine-tuning, what is the max micro-batch size, and how does gradient accumulation set effective batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1879", "title": "15B INT8 LLM OOM on a 32 GB Edge Accelerator", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can a 15B INT8 LLM still OOM on a 32 GB device, and what memory-pressure strategies and OS effects would you consider?", "chain_ids": ["edge-chain-auto-secondary-011-09"], "chain_positions": {"edge-chain-auto-secondary-011-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-09": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1880", "title": "Memory Optimization for Large Language Model Deployment on NVIDIA Jetson Orin", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which Jetson Orin deployment architecture is more stable under memory pressure, and how would you mitigate OOMs, fragmentation, and future fine-tuning needs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1881", "title": "Edge LLM Memory Optimization for Real-time Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize memory and handle OOMs for an INT8 edge LLM when activations exceed on-chip memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1882", "title": "Edge TPU Memory Optimization for OOM Prevention", "topic": "memory-pressure-management", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and fix sporadic Coral Edge TPU OOMs from peak activations or fragmentation while maintaining 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1883", "title": "Optimizing Object Detection Latency on Google Coral Edge TPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you break down the 150 ms Coral Edge TPU frame latency to find whether capture, preprocessing, USB transfer, inference, or postprocessing dominates?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1884", "title": "Decomposing End-to-End Latency on Qualcomm Cloud AI 100 for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose the 75ms end-to-end latency to identify the bottlenecks preventing you from meeting the 50ms target?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1885", "title": "Latency Decomposition for Real-time Inference on NVIDIA Jetson Orin", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose the 100 ms object-detection latency budget into acquisition, preprocessing, inference, postprocessing, and output?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1886", "title": "Edge TPU Latency Decomposition for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you allocate and measure the Coral Edge TPU drone pipeline's 50 ms latency budget across preprocessing, transfers, inference, and control?", "chain_ids": ["edge-chain-auto-secondary-001-04"], "chain_positions": {"edge-chain-auto-secondary-001-04": 3}, "chain_tiers": {"edge-chain-auto-secondary-001-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1887", "title": "Optimizing Real-time Object Detection Latency on Hailo-8 for Edge AI", "topic": "latency-decomposition", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you decompose and optimize the Hailo-8 drone detection pipeline to reduce end-to-end latency from 150ms to the 100ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1888", "title": "Optimizing Object Detection on Jetson Orin: Dynamic Batching for Throughput and Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which batching strategy and scheduling policy would you use for 4K30 object detection to maximize throughput while keeping latency under 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1889", "title": "Optimizing Object Detection Latency and Throughput on Hailo-8 with Advanced Batching Strategies", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use static, dynamic, and continuous batching to balance 50 ms critical-event latency with maximum multi-stream throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1890", "title": "Diagnosing Latency Spikes with Batching on Qualcomm Cloud AI 100", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the static batch size of 4 causing tail latency spikes, and how would dynamic or continuous batching reduce them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1891", "title": "Optimizing Real-time Inference on NVIDIA Jetson Orin with Adaptive Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What 
batching and scheduling policy would you use to keep 90% of inferences under 50 ms while maximizing throughput?", "chain_ids": ["edge-chain-auto-secondary-005-09"], "chain_positions": {"edge-chain-auto-secondary-005-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-005-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1892", "title": "Jetson Orin: Latency vs. Throughput in Edge Inference Batching", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you batch and schedule multi-camera YOLOv7 to maximize effective FPS while keeping every frame under 100ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1893", "title": "Real-Time Object Detection on Hailo-8: Analyzing Jank and ANR", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "As a senior ML Systems Engineer, how would you systematically analyze the root causes of these real-time performance violations given the stated constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1894", "title": "Diagnosing Latency Spikes on Qualcomm Cloud AI 100", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which profiling tools and traces would you use on the Qualcomm Cloud AI 100 to determine whether the latency spikes are compute, memory, or I/O bound?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1895", "title": "Optimizing Real-time Inference Latency on Jetson Orin", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the Jetson Orin pipeline to find whether the 150 ms inference latency is compute-, memory-, I/O-, or power-throttling-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1896", "title": "Optimizing Latency on Hailo-8: Profiling for Edge ML Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the accelerator and host pipeline to pinpoint the compute, memory, or I/O bottlenecks causing 50 ms latency violations?", "chain_ids": ["edge-chain-auto-secondary-010-01"], "chain_positions": {"edge-chain-auto-secondary-010-01": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-01": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1897", "title": "Optimizing Real-time Object Detection Latency on Google Coral Edge TPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the Coral Edge TPU pipeline and quantify why 30 ms TPU inference becomes over 80 ms end-to-end latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1898", "title": "Diagnosing Latency on Qualcomm Cloud AI 100 with Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use profiling and trace tools on the Qualcomm Cloud AI 100 to determine whether the 50 ms LLM latency miss is compute, memory, or I/O bound?", "chain_ids": ["edge-chain-auto-secondary-010-03"], "chain_positions": {"edge-chain-auto-secondary-010-03": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1899", "title": "Hailo-8 Edge Latency Bottleneck for Object Detection", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you identify whether compute, memory, or I/O is causing dropped frames on the Hailo-8 at 30 FPS, and what first mitigations would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1900", "title": "Optimizing Latency on Edge Accelerators for LLM Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you systematically find and fix the bottlenecks causing latency spikes beyond the 50 ms budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1901", "title": "Real-time Object Detection Latency Optimization on NVIDIA Jetson Orin", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify the bottlenecks causing 1080p/30 FPS latency spikes above 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1902", "title": "Jetson Orin INT8 Quantization: Performance vs. 
Accuracy Trade-offs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you recover mAP and reduce false negatives on small objects after INT8 PTQ of YOLOv7?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1903", "title": "Quantizing a Large Language Model for Qualcomm Cloud AI 100 Deployment", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you quantize and deploy the FP16 LLM on the Qualcomm Cloud AI 100 to maximize INT8/INT4 performance while preserving accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1904", "title": "Designing a Mixed-Precision Strategy for Edge Deployment on Hailo-8", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What INT8 quantization and deployment strategy would you use on Hailo-8 to maximize throughput and power efficiency within 1% of FP32 accuracy?", "chain_ids": ["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1905", "title": "Quantization Strategy for Coral Edge TPU Deployment", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantize and validate FP32 ResNet-50 for the INT8-only Coral Edge TPU to hit 10 ms latency with under a 2-point accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1906", "title": "Optimizing Mixed-Precision Inference on Hailo-8", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you adapt the FP32 LLM for mixed-precision inference on an INT8-optimized edge accelerator while evaluating accuracy, latency, and power tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1907", "title": "Optimizing Edge Deployment: Mixed-Precision Training for Coral Edge TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare Model A and Model B for INT8-only Coral deployment across accuracy, throughput, energy, and quantization risks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-1908", "title": "Mixed-Precision Deployment for a 64 GB LLM on a 32 GB Edge Accelerator", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What mixed-precision deployment strategy fits the FP32 LLM within 32 GB memory while minimizing latency and preserving accuracy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1909", "title": "Optimizing Real-time Object Detection on NVIDIA Jetson Orin via Mixed-Precision", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks and design a quantified mixed-precision plan for YOLOv8-L on Jetson Orin to reach 30 FPS without OOMs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1910", "title": "Extreme Quantization Strategy for Large Models on Google Coral Edge TPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is sub-4-bit deployment infeasible for the oversized model on Coral Edge TPU, and what fallback plan meets the latency and accuracy targets?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1911", "title": "Deploying Sub-4-bit LLMs on a 75W Edge Accelerator", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy sub-4-bit weight quantization for the LLM while using INT8-friendly kernels and fallbacks?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1912", "title": "INT8 Vision Model Deployment on Hailo-8", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you quantize the object detector for Hailo-8 to achieve at least 85% mAP and 30 FPS under its 26 TOPS, 2.5W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1913", "title": "Extreme Quantization Deployment on Qualcomm Cloud AI 100", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you prepare, deploy, and validate the 2-bit AWQ LLM on the Cloud AI 100 given limited native sub-4-bit support?", "chain_ids": ["edge-chain-auto-secondary-006-03"], "chain_positions": {"edge-chain-auto-secondary-006-03": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-03": "secondary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1914", "title": "Extreme Quantization of Vision Transformer on Hailo-8", "topic": "extreme-quantization", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy a compressed ViT on Hailo-8 while respecting its supported INT8 execution path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1915", "title": "Hailo-8 Edge Power Budgeting for Perception", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize Hailo-8 power so a 10 TOPS perception model sustains 30 FPS within the 5W module budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1916", "title": "Power Budgeting for Real-time Edge TPU Deployment", "topic": "power-budgeting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you ensure a 200 GOPS INT8 model sustains 10 FPS on the 2W Coral Edge TPU, and what power trade-offs would you analyze?", "chain_ids": ["edge-chain-auto-secondary-002-12"], "chain_positions": {"edge-chain-auto-secondary-002-12": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1917", "title": "NVIDIA Jetson Orin Thermal Throttling: Sustained Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause for this performance degradation, and what immediate thermal management strategy should be applied?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1918", "title": "Hailo-8 Thermal Throttling Analysis", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the device dropping to 15 TOPS at 35°C, and how would you prevent it from throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1919", "title": "Quantifying Sustained Performance Degradation due to Thermal Throttling on Qualcomm Cloud AI 100", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "With power throttled from 75W to 60W, what sustained INT8 TOPS, percentage reduction, and general TOPS_effective formula do you get?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1920", "title": "Hailo-8 Thermal Design for Edge Deployment", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you ensure the Hailo-8 doesn't thermal throttle under the worst-case ambient conditions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1921", "title": "Coral Edge TPU Thermal Throttling in High-Ambient Environments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign cooling and workload control so the Coral Edge TPU sustains performance at 45°C without latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1922", "title": "Sustained Inference Throttling Across Edge Ambient Temperatures", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare the Cloud AI 100 and Accelerator X for throttling-free sustained inference from 0°C to 45°C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1923", "title": "Optimizing Sustained Performance on NVIDIA Jetson Orin in Challenging Environments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose, mitigate, and manage thermal throttling so the drone vision system sustains performance at 40°C?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 4}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1924", "title": "Hailo-8 Edge Deployment: Sustained Performance under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify a thermal solution for Hailo-8 throughput drops below 10 TOPS in 50°C industrial operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1925", "title": "Google Coral Edge TPU Thermal Throttling in Edge Deployments", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the most likely cause of the Coral Edge TPU throughput dropping 50% after 5-10 minutes at 40°C, and how would you mitigate it?", "chain_ids": ["edge-chain-auto-secondary-006-16"], "chain_positions": {"edge-chain-auto-secondary-006-16": 2}, 
"chain_tiers": {"edge-chain-auto-secondary-006-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1926", "title": "Energy-Aware Model Design for Real-Time Inference on Google Coral Edge TPU", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Coral Edge TPU model for 30 FPS with minimal energy per inference, considering memory access is far costlier than INT8 MACs?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1927", "title": "Qualcomm Cloud AI 100: INT8 Matrix Multiply Energy Analysis", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Estimate the total energy consumed by compute operations versus memory accesses. Which component dominates the energy budget, and by how much?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1928", "title": "Energy-Aware Edge Selection: Jetson AGX Orin vs. Orin NX", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which platform is more energy-suitable for 1000 INT8 inferences/s with a 10 GB model, and how do compute, memory, and TDP trade off?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1929", "title": "Energy-Efficient LLM Deployment on NVIDIA Jetson Orin", "topic": "energy-per-operation", "competency_area": "power", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile and optimize Jetson Orin LLM inference energy when both INT8 matmuls and LPDDR5 embedding/KV-cache accesses affect latency?", "chain_ids": ["edge-chain-auto-secondary-005-12"], "chain_positions": {"edge-chain-auto-secondary-005-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-005-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1930", "title": "7B LLM Feasibility on NVIDIA Jetson Orin", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you deploy the 7B FP16/INT8 LLM on Jetson Orin to meet a 2048-token context and <100 ms per token latency?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1931", "title": "Reducing Transformer Attention Spikes on Edge Accelerators", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks cause 512-token latency and power spikes on the edge accelerator, and what quantified 
attention redesign would fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1932", "title": "MobileNet Architectural Benefits on Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why are depthwise separable convolutions beneficial for CNNs on the INT8-only Coral Edge TPU with 4 TOPS and a 2W power budget?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 1}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1933", "title": "Optimizing MobileNetV3 for Edge Deployment on Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you optimize MobileNetV3-Large on the Cloud AI 100 to reduce latency spikes and stay within the 75W power envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1934", "title": "Optimizing MobileNetV3 on Hailo-8 for Real-time Edge Inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 25ms latency on Hailo-8 and optimize MobileNetV3-Large to reach 10ms while preserving accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1935", "title": "Designing an Efficient MobileNet for Google Coral Edge TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and optimize MobileNetV2 for 50 FPS under 20ms and 2W on the INT8-only TPU?", "chain_ids": ["edge-chain-auto-secondary-002-07"], "chain_positions": {"edge-chain-auto-secondary-002-07": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1936", "title": "Jetson Orin MobileNetV2 vs EfficientNetB0 Deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which model, MobileNetV2 or EfficientNetB0, would you deploy on the Jetson Orin, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1937", "title": "Edge AI Deployment: Optimizing CNN for Qualcomm Cloud AI 100", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign ResNet-50 using 
MobileNet/EfficientNet-style blocks to hit 10ms, 90% mAP, and the 75W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1938", "title": "Edge Deployment: Model Sizing for NVIDIA Jetson Orin", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What steps would you take to estimate the model's memory footprint and computational load, and propose strategies to ensure successful deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1939", "title": "Hailo-8 Deployment: Model Memory Footprint & Throughput", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How much INT8 parameter memory does the 15M-parameter model need, what is its theoretical Hailo-8 throughput at 20 GOPS, and what do these imply?", "chain_ids": ["edge-chain-auto-secondary-003-26"], "chain_positions": {"edge-chain-auto-secondary-003-26": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1940", "title": "Coral Edge TPU Feasibility for MobileNetV3-Small: Memory and Performance Estimation", "topic": "model-size-estimation", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Based on a feasibility estimation of memory footprint and INT8 operations, can the Coral Edge TPU meet the 30 FPS requirement?", "chain_ids": ["edge-chain-auto-secondary-003-29"], "chain_positions": {"edge-chain-auto-secondary-003-29": 2}, "chain_tiers": {"edge-chain-auto-secondary-003-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1941", "title": "Hardware-aware NAS Constraints on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the Cloud AI 100's LPDDR4x, 400 TOPS INT8, and 75W budget shape the NAS search space and objectives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1942", "title": "Hardware-aware NAS Deployment on NVIDIA Jetson Orin for Real-time Object Detection", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why might the general-purpose NAS model miss Jetson Orin latency and power targets, and how should hardware-aware NAS fix this?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1943", "title": "Hardware-aware NAS for Hailo-8: Latency and Memory Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you identify Hailo-8 layer bottlenecks and estimate whether a modified layer meets 5ms latency or 50KB feature-map memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1944", "title": "Hardware-aware NAS for Coral Edge TPU Deployment", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS strategy for Coral Edge TPU that enforces <30ms latency, <2W power, and INT8 constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1945", "title": "Hardware-Aware NAS for a Vision Model on Qualcomm Cloud AI 100", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS pipeline for the AI 100 to reduce the 150 GOPS, 12GB-activation baseline while keeping at least 88% mAP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1946", "title": "Hardware-aware NAS for Real-time Object Detection on NVIDIA Jetson Orin", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design hardware-aware NAS for Jetson Orin using SRAM, LPDDR5, INT8 FLOPs, and measured latency constraints?", "chain_ids": ["edge-chain-auto-secondary-007-07"], "chain_positions": {"edge-chain-auto-secondary-007-07": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1947", "title": "Hardware-Aware NAS for Edge TPU Deployment", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use hardware-aware NAS to design a 30 FPS object detector for the INT8-only Coral Edge TPU under its limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1948", "title": "Encoder-Only NLU Model for Google Coral Edge TPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture (encoder-only, decoder-only, or encoder-decoder) fits Coral Edge TPU NLU best under 4 TOPS INT8 and 2W, and why?", "chain_ids": ["edge-chain-auto-secondary-011-25"], 
"chain_positions": {"edge-chain-auto-secondary-011-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1949", "title": "Edge Deployment Tradeoffs: Encoder-Decoder Architectures on Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which NLP architecture would you choose on the edge for both classification and summarization, and how would you meet memory, latency, and power limits?", "chain_ids": ["edge-chain-auto-secondary-011-27"], "chain_positions": {"edge-chain-auto-secondary-011-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1950", "title": "Edge TPU Deployment: Encoder-Decoder Model Latency Diagnosis", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What architectural and deployment issues likely cause >500ms latency and thermal warnings for the INT8 encoder-decoder NLU model on Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 2}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1951", "title": "INT8 Encoder Model for 100-Token Edge NLU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which LLM architecture would you choose on this platform for 100-token NLU under 100ms, and how do memory, INT8 compute, and power constraints affect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1952", "title": "Edge NLP Architecture Tradeoffs for Summarization on NVIDIA Jetson Orin", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare encoder-only, decoder-only, and encoder-decoder summarizers on Jetson Orin for memory, latency, and power, and choose one?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1953", "title": "Hailo-8 Edge Deployment: Encoder-Decoder Architecture Tradeoffs for NLP Classification", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For Hailo-8 text classification, how would you compare an INT8 MobileBERT-style encoder with a distilled GPT-2-style decoder, and which is preferable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1954", "title": 
"Encoder-Decoder Tradeoffs on Google Coral Edge TPU for Language Tasks", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How feasible are encoder-only, decoder-only, and encoder-decoder NLU models on the 4 TOPS, 2W, INT8-only Coral Edge TPU, and how would you make one deployable?", "chain_ids": ["edge-chain-auto-secondary-011-25"], "chain_positions": {"edge-chain-auto-secondary-011-25": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1955", "title": "Pruning Techniques and Hardware Alignment on NVIDIA Jetson Orin", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do structured and unstructured pruning differ for an LLM on an edge INT8 accelerator, and what accuracy-speedup tradeoffs do they create?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 0}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1956", "title": "Hailo-8 Pruning Strategy for Efficient Edge Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do pruning sparsity patterns matter on a dataflow architecture, and how would you trade accuracy, speedup, and power for structured vs unstructured pruning?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 0}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1957", "title": "Optimizing Edge Vision Models: Structured Pruning for Coral TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage pruning and sparsity techniques, specifically considering the Coral Edge TPU's architecture and INT8-only support, to achieve the <50ms target without significant accuracy loss?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 3}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1958", "title": "Optimizing Sparse Transformer Inference on Qualcomm Cloud AI 100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you convert 90% unstructured sparsity into hardware-friendly sparsity on the AI 100 to maximize throughput while preserving 99% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "edge-1959", "title": "Optimizing LLM Deployment on Hailo-8 via Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose pruning granularity and sparsity patterns to maximize accelerator utilization and minimize energy consumption?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 1}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1960", "title": "Pruning MobileNetV2 for Google Coral Edge TPU Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you prune MobileNetV2 for the INT8-only Coral Edge TPU, and which sparsity type best improves speed and power with minimal accuracy loss?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 1}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1961", "title": "Optimizing Vision Model Latency with Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did 70% unstructured pruning fail on Jetson Orin, and what structured pruning pattern would you use to reach 30 FPS?", "chain_ids": ["edge-chain-auto-001-02"], "chain_positions": {"edge-chain-auto-001-02": 2}, "chain_tiers": {"edge-chain-auto-001-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1962", "title": "Optimizing LLM Deployment on Hailo-8 via Pruning and Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose structured or unstructured sparsity patterns for an INT8 LLM on Hailo-8, and evaluate accuracy, latency, throughput, and power?", "chain_ids": ["edge-chain-auto-001-03"], "chain_positions": {"edge-chain-auto-001-03": 2}, "chain_tiers": {"edge-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1963", "title": "Optimizing Large Language Models on Qualcomm AI 100: Knowledge Distillation Strategies", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use logit and feature distillation to build an INT8 student for Cloud AI 100, and when is distillation better than pruning?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1964", "title": "Optimizing Edge Inference with Knowledge Distillation on Hailo-8", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill a 90% mAP YOLOv5s teacher into a Hailo-8 student that achieves at least 88% mAP, <10ms latency, and lower power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1965", "title": "Knowledge Distillation for Low-Latency Object Detection on Google Coral Edge TPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the maximum INT8 MAC budget for 50ms on the 4 TOPS Coral Edge TPU, and how would distillation preserve accuracy within it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1966", "title": "Optimizing Large Language Models for Edge Deployment on NVIDIA Jetson Orin using Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you distill the 10B LLM into an INT8 student for the edge device that fits 32GB, stays under 60W, and reaches sub-20ms queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1967", "title": "Knowledge Distillation on Google Coral Edge TPU: Teacher-Student Model Evaluation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you evaluate the distilled MobileNetV2 on Coral Edge TPU against the ResNet-50 teacher, including accuracy, latency, power, memory, and INT8 effects?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1968", "title": "Knowledge Distillation for Efficient LLM Deployment on Qualcomm Cloud AI 100", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use knowledge distillation to deploy an accurate INT8 student LLM on the Cloud AI 100 within 32GB LPDDR4x, 400 TOPS, and 75W limits?", "chain_ids": ["edge-chain-auto-secondary-008-20"], "chain_positions": {"edge-chain-auto-secondary-008-20": 3}, "chain_tiers": {"edge-chain-auto-secondary-008-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1969", "title": "Optimizing Vision Models for Hailo-8 with Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would 
you use logit and feature distillation to optimize a ResNet-50 detector for Hailo-8, and when is KD better than pruning?", "chain_ids": ["edge-chain-auto-secondary-008-21"], "chain_positions": {"edge-chain-auto-secondary-008-21": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1970", "title": "Optimizing Memory-Bound Operations on Hailo-8 via Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would kernel and operator fusion reduce latency and energy for sequential memory-bound INT8 ops on a dataflow accelerator?", "chain_ids": ["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 1}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1971", "title": "Optimizing a Vision Model with Operator Fusion on Google Coral Edge TPU", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse ReLU, BatchNorm, and Quantize after the 1x64x64x128 INT8 Conv2D output to reduce Coral TPU memory I/O and launch overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1972", "title": "Optimizing a Vision Model on Qualcomm Cloud AI 100 with Kernel Fusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you fuse the memory-bound ops before the 2D convolution on Cloud AI 100 to reduce LPDDR4x traffic, kernel launches, latency, and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1973", "title": "Optimizing Edge Inference on Jetson Orin: Kernel Fusion for Memory-Bound Operations", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Would you keep the Jetson Orin post-processing ops as separate CUDA kernels or fuse them into one custom kernel, and what latency benefit would you expect?", "chain_ids": ["edge-chain-auto-025-14"], "chain_positions": {"edge-chain-auto-025-14": 2}, "chain_tiers": {"edge-chain-auto-025-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1974", "title": "Hailo-8: Optimizing Memory-Bound Operations via Kernel Fusion for Edge Deployment", "topic": "kernel-fusion", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign this block to maximize throughput and energy efficiency, leveraging the Hailo-8's architectural characteristics?", "chain_ids": 
["edge-chain-auto-025-13"], "chain_positions": {"edge-chain-auto-025-13": 3}, "chain_tiers": {"edge-chain-auto-025-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1975", "title": "AOT Optimization Bottlenecks on Qualcomm AI 100 for LLM Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might the INT8 AOT-compiled LLM underperform on Cloud AI 100, and how would you optimize the graph for its memory hierarchy and ISA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1976", "title": "Optimizing a Large Language Model for Edge Deployment on NVIDIA Jetson Orin", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the PyTorch LLM for efficient real-time execution on Jetson Orin using AOT compilation and graph-level compiler passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1977", "title": "Hailo-8 Inference Optimization with Constant Folding", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the maximum throughput increase for the folded Hailo-8 subgraph, and how much power would the 25% operation reduction save?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1978", "title": "Diagnosing Coral Edge TPU Graph Compilation & Performance Issues", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the INT8 TFLite model failing or falling back on Coral Edge TPU, and how would you fix the graph compilation pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1979", "title": "Optimizing Large Language Model Deployment on Qualcomm Cloud AI 100 via Ahead-of-Time Graph Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an AOT compilation strategy on the Cloud AI 100 to meet sub-10ms latency and 500 batch-1 inferences per second?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1980", "title": "Jetson Orin Deployment: Compiler Optimization for Real-time Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "inference", "question": "How would you investigate and improve the graph compilation pipeline to reduce 200ms/token latency and mitigate throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1981", "title": "Optimizing a 7B INT8 LLM for Qualcomm Cloud AI 100", "topic": "graph-compilation", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compile and optimize the 7B quantized LLM across operator lowering, constant folding, memory, and power?", "chain_ids": ["edge-chain-auto-secondary-006-27"], "chain_positions": {"edge-chain-auto-secondary-006-27": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1982", "title": "Optimizing Operator Scheduling on NVIDIA Jetson Orin", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What operator scheduling strategies should the developer use on Jetson Orin for memory reuse, parallel execution, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1983", "title": "Optimizing CNN Inference on Hailo-8 for Edge Deployment", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule CNN operators to hit 30 FPS while optimizing memory reuse, parallelism, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1984", "title": "Optimizing Conv Layer Memory on Qualcomm Cloud AI 100", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What peak intermediate memory is required under sequential execution versus ideal layer fusion for the two convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1985", "title": "Optimizing Deep Learning Inference on Jetson Orin: Diagnosing Performance Bottlenecks in Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the Jetson Orin pipeline reaches only 15 FPS, focusing on execution order, memory reuse, and CPU-GPU transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1986", "title": "Optimizing MobileNetV3 for Hailo-8: Scheduling for Memory, Parallelism, and Fusion", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", 
"zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule MobileNetV3-Large operators on Hailo-8 to meet <5ms latency using memory reuse, parallel branches, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1987", "title": "Optimizing Multi-branch Vision Model Inference on Google Coral Edge TPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator execution order on the Coral Edge TPU for the multi-branch INT8 detector to reduce latency, memory, and energy?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1988", "title": "Edge Accelerator Scheduling for BERT-tiny Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate the impact of memory reuse, parallel execution, and layer fusion on BERT-tiny latency and throughput across the two accelerators?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1989", "title": "Optimizing Vision Transformer Execution on NVIDIA Jetson Orin", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize ViT operator scheduling on the edge device to meet a 30ms latency target while reducing memory use and staying within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1990", "title": "Hailo-8 CNN Inference Optimization via Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize Hailo-8 operator scheduling to reach 30 FPS using memory reuse, parallel execution, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1991", "title": "Load Balancing for Dynamic Qualcomm Cloud AI 100 Inference Workloads", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which load balancing algorithm best minimizes re-partitioning and cache invalidation during scaling events, and how does it work?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 0}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1992", "title": "Edge Inference Traffic Management on NVIDIA Jetson Orin", "topic": "load-balancing", "competency_area": "networking", 
"track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing for a Jetson Orin edge fleet to handle variable model loads, network latency, and the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1993", "title": "Edge Inference Load Balancing with Hailo-8 Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route mixed critical and batch requests across Hailo-8 accelerators under failures or thermal throttling?", "chain_ids": ["edge-chain-auto-secondary-006-09"], "chain_positions": {"edge-chain-auto-secondary-006-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1994", "title": "Coral Edge TPU Fleet Sizing and Load Balancing for Real-time Inference", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many inferences per second can one Coral Edge TPU handle, how many TPUs are needed at 70% utilization for 800 IPS, and what total power is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1995", "title": "Edge Inference Load Balancing for Qualcomm Cloud AI 100 Fleet", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design request routing and load balancing for a Qualcomm Cloud AI 100 edge fleet serving diverse real-time LLM and CV workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1996", "title": "Edge ML Inference: Optimizing Load Balancing with Hailo-8 Accelerators", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which routing and load-balancing strategies would you compare for the 50-Hailo-8 cluster, and how would you handle spikes to 1000 RPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1997", "title": "Edge Inference Load Balancing with Coral Edge TPUs", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route and load balance inference frames across thousands of Edge TPUs under variable networks, failures, and INT8-only constraints?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1998", "title": "Optimizing Edge Inference Routing on Qualcomm Cloud AI 100", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you replace round-robin routing for the Cloud AI 100 fleet to reduce latency spikes and uneven utilization in real-time object detection?", "chain_ids": ["edge-chain-auto-secondary-006-08"], "chain_positions": {"edge-chain-auto-secondary-006-08": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-1999", "title": "Edge AI Load Balancing & Routing for NVIDIA Jetson Orin Deployments", "topic": "load-balancing", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you route and shed drone inference requests on Jetson Orin while preserving tracker affinity, version safety, and 100ms latency?", "chain_ids": ["edge-chain-auto-secondary-006-06"], "chain_positions": {"edge-chain-auto-secondary-006-06": 3}, "chain_tiers": {"edge-chain-auto-secondary-006-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2000", "title": "Google Coral TPU Edge Deployment Strategy", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What should the Coral Edge TPU server do for INT8 model loading, lightweight serving, and concurrent requests to achieve low-latency inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2001", "title": "Reducing LLM Cold Start on Qualcomm Cloud AI 100", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you minimize cold start latency for a 20GB LLM on Qualcomm Cloud AI 100 when scaling to zero, while staying within memory and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2002", "title": "Jetson Orin Edge Model Memory Management for Autonomous Vehicles", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many unique secondary models can be pre-loaded while reserving RAM and TOPS for the primary and one active secondary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2003", "title": "Edge Object Detection with Hailo-8: Latency, Throughput, and Power Optimization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How would you architect the serving infrastructure and latency budget across data acquisition, pre/post-processing, and inference to meet 70ms end-to-end latency on a Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2004", "title": "Optimizing Real-time Edge Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a model serving strategy to meet the <50ms latency and 100 detections/sec throughput requirements while handling cold starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2005", "title": "Edge AI Model Serving on Hailo-8 with Dynamic Workloads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a Hailo-8 model serving strategy for multiple variable-load inspection models with sub-50ms critical latency and low cold starts?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2006", "title": "MLOps for INT8 Deployment on Coral Edge TPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What Coral Edge TPU data type constraint must the model artifact satisfy, and what CI/CD practice ensures it before deployment?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 0}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2007", "title": "Edge AI Model Deployment: Latency vs. 
Throughput on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the 15 TOPS INT8 model meet a 50ms latency target on Jetson Orin, and what CI/CD profiling and MLOps steps would you add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2008", "title": "MLOps for Edge Deployment with Google Coral Edge TPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the MLOps pipeline for FP32-to-INT8 Coral Edge TPU deployments to ensure CI/CD, reproducibility, and training-serving consistency?", "chain_ids": ["edge-chain-auto-026-10"], "chain_positions": {"edge-chain-auto-026-10": 1}, "chain_tiers": {"edge-chain-auto-026-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2009", "title": "MLOps for LLM Deployment on Qualcomm Cloud AI 100 Edge Devices", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the MLOps lifecycle for frequent LLM updates on Qualcomm Cloud AI 100 while meeting 32GB memory, INT8, and 75W constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2010", "title": "Edge MLOps Performance Regression on Jetson Orin", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the YOLOv8s FPS regression on Jetson Orin and improve MLOps so future CI/CD catches edge performance regressions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2011", "title": "Edge LLM OTA Update System Design for Qualcomm AI 100", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design OTA updates for 100,000 edge devices with 10GB firmware, <5min downtime, and immediate rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2012", "title": "Optimizing FOTA for NVIDIA Jetson Orin Fleets with A/B Partitions", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a reliable A/B FOTA strategy for remote devices under intermittent networks and power interruptions?", "chain_ids": ["edge-chain-auto-027-17"], "chain_positions": {"edge-chain-auto-027-17": 5}, "chain_tiers": {"edge-chain-auto-027-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2013", "title": "Optimizing Object Detection on Hailo-8 with Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify and resolve Hailo-8 operator gaps, delegate unsupported ops, and weigh the delegation trade-offs?", "chain_ids": ["edge-chain-auto-secondary-006-11"], "chain_positions": {"edge-chain-auto-secondary-006-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2014", "title": "Optimizing ONNX Model Deployment on Qualcomm Cloud AI 100 with Operator Coverage Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify, diagnose, and resolve ONNX operator gaps and delegate work to maximize throughput on the Qualcomm Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-006-13"], "chain_positions": {"edge-chain-auto-secondary-006-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2015", "title": "Diagnosing Suboptimal ONNX Runtime Performance on Jetson Orin", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the system only reaches 8 FPS with low GPU and high CPU usage, and what fixes would you try?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2016", "title": "Hailo-8 Vision Model Conversion with Custom Operators", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle unsupported PyTorch operators and fit the vision model within Hailo-8 latency, compute, and 2.5W limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2017", "title": "Optimizing Vision Model Deployment on Jetson Orin: TensorRT Conversion Challenges", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you find ONNX-to-TensorRT operator gaps and choose between plugins, layer substitution, or CPU fallback to hit sub-20 ms on Jetson Orin?", "chain_ids": ["edge-chain-auto-secondary-006-10"], "chain_positions": {"edge-chain-auto-secondary-006-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2018", "title": "Progressive Rollout Strategy for Edge ML on Qualcomm 
Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you roll out the new model progressively with traffic splitting, monitoring, and rollback, given edge fleet constraints?", "chain_ids": ["edge-chain-auto-secondary-011-03"], "chain_positions": {"edge-chain-auto-secondary-011-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2019", "title": "Analyzing Performance Degradation in Edge ML Model Rollouts", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the Jetson Orin canary drop from 60 to 30 FPS and P99 jump from 50 to 150 ms, and which metrics would you correlate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2020", "title": "Edge TPU Model Rollout Strategy with A/B Testing", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a 48-hour 1% canary rollout of v2 across 10,000 intermittent Coral Edge TPU cameras with A/B comparison and rollback?", "chain_ids": ["edge-chain-auto-secondary-011-01"], "chain_positions": {"edge-chain-auto-secondary-011-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2021", "title": "Diagnosing Canary Rollout Failures on Edge AI with Qualcomm Cloud AI 100", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the canary group's P99 latency jump from 50 ms to 200 ms and error-rate increase on Qualcomm Cloud AI 100 devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2022", "title": "Edge ML Model Rollout on NVIDIA Jetson Orin", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an OTA rollout with observability, seamless model/config transitions, and rollback for a Jetson Orin factory fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2023", "title": "Hailo-8 Edge Deployment: Canary Release for Object Detection Model", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you run a 10% canary of the new Hailo-8 object detector across 1,000 devices, monitor it, and estimate power impact?", "validated": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2024", "title": "Evaluating A/B Rollout Strategies for Edge AI Models on Google Coral TPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare canary and shadow rollout strategies for Model B on 10,000 Coral Edge TPU cameras, including KPIs, constraints, and rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2025", "title": "Canary Rollout and Optimization for Edge ML", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you implement a Jetson Orin canary rollout and monitor latency, throughput, power, and rollback triggers for regressions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2026", "title": "Diagnosing End-to-End Latency in a Multi-Model Edge Pipeline on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the 250 ms latency in the Hailo-8 two-model video pipeline when CPU utilization is low?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2027", "title": "Real-time Multi-Model Object Analysis on Google Coral Edge TPU", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the Coral Edge TPU compound detector-classifier pipeline to stay under 100 ms per frame and 2W?", "chain_ids": ["edge-chain-auto-secondary-002-10"], "chain_positions": {"edge-chain-auto-secondary-002-10": 3}, "chain_tiers": {"edge-chain-auto-secondary-002-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2028", "title": "Real-time Anomaly Detection on Jetson Orin: Multi-model RAG Pipeline Optimization", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which architecture, monolithic or microservices, is better for sub-100 ms on Jetson Orin, and what are the latency, memory, and power trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2029", "title": "Optimizing a Multi-Modal RAG Pipeline for Edge Deployment on Hailo-8", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design and deploy the Hailo-8 multimodal RAG pipeline to meet <200 ms latency, memory, routing, and 2.5W power constraints?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2030", "title": "Optimizing Chained Inference on Google Coral Edge TPU for Real-time Anomaly Detection", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks beyond raw TPU inference explain the 60 ms frame latency, and how would you optimize them to meet the 33 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2031", "title": "Real-time Data Drift Monitoring on Qualcomm Cloud AI 100", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor input data drift on Qualcomm Cloud AI 100 without significantly impacting real-time inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2032", "title": "Diagnosing Performance Degradation on Jetson Orin: Edge Drift Analysis", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you determine whether the accuracy drop is due to data or concept drift and detect it under edge constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2033", "title": "Real-time Data Drift Detection on Google Coral Edge TPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect lighting or camera-sensor data drift on Coral Edge TPUs using integer-friendly statistics without uploading raw data?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2034", "title": "Edge AI Drift Detection on Hailo-8", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you detect slow data drift on a production-line detector while minimizing inference overhead and staying within the 2.5W power impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2035", "title": "Detecting and Mitigating Data Drift on Jetson Orin for Edge Reliability", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and quantify data drift on Jetson Orin, mitigate it within resource limits, and measure restored reliability?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2036", "title": "Edge ML Drift Detection on 
Hailo-8 for Real-time Anomaly Systems", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an on-device Hailo-8 drift monitoring and reliability strategy that accounts for 26 TOPS INT8, 2.5W, and quantization effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2037", "title": "Graceful Degradation on Jetson Orin for Autonomous Vehicles", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for a Jetson Orin perception system that misses its 50 ms deadline under load or sensor degradation?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2038", "title": "Graceful Degradation for Real-time Object Detection on Edge AI (Hailo-8)", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a graceful degradation strategy using ladders, fallbacks, and QoS shedding, distinguishing between fail-operational and fail-safe states?", "chain_ids": ["edge-chain-auto-secondary-011-06"], "chain_positions": {"edge-chain-auto-secondary-011-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-011-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2039", "title": "Graceful Degradation for Real-time Drone Navigation on Google Coral Edge TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Coral Edge TPU graceful degradation strategy for drone object detection using fallback models, QoS shedding, and fail-safe modes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2040", "title": "Diagnosing Graceful Degradation on Jetson Orin for Edge ML", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the performance degradation and the failure of graceful fallback mechanisms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2041", "title": "Graceful Degradation for Edge Defect Detection on Hailo-8", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design degradation ladders, model 
fallbacks, fail-safe behavior, and QoS shedding for the Hailo-8 defect detector under a 3W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2042", "title": "Graceful Degradation for Edge Object Detection on Google Coral TPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design graceful degradation for a Coral Edge TPU industrial detector under stress while respecting INT8-only 2W limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2043", "title": "Designing a Graceful Degradation Strategy for Real-time ML on NVIDIA Jetson Orin", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design fail-operational graceful degradation on Jetson Orin using model fallbacks, degradation ladders, and QoS shedding under load or sensor faults?", "chain_ids": ["edge-chain-auto-secondary-011-04"], "chain_positions": {"edge-chain-auto-secondary-011-04": 3}, "chain_tiers": {"edge-chain-auto-secondary-011-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2044", "title": "Graceful Degradation on Hailo-8 for Edge Surveillance", "topic": "graceful-degradation", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Hailo-8 surveillance degradation strategy with ladders, model fallbacks, QoS shedding, and fail-safe versus fail-operational trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2045", "title": "ASIL-B Safety Architecture for Deterministic Edge Inference under 2W Power Constraints", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect edge safety software for ASIL-B deterministic inference, watchdogs, and self-tests under strict INT8 and 2W power constraints?", "chain_ids": ["edge-chain-auto-secondary-008-01"], "chain_positions": {"edge-chain-auto-secondary-008-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-008-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2047", "title": "Designing an ISO 26262 ASIL-B Perception System on Hailo-8", "topic": "safety-certification", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect an ASIL-B Hailo-8 perception pipeline with deterministic latency, watchdogs, self-tests, and fault responses within 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2048", 
"title": "Hailo-8 Hardware Security Features for Edge Adversarial Robustness", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What built-in hardware security features should a Hailo-8-class edge AI accelerator use to resist physical attacks and model extraction?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2049", "title": "Adversarial Robustness of INT8 Models on Coral Edge TPU for Security Applications", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does INT8-only inference on the 4 TOPS, 2W Coral Edge TPU affect adversarial patch susceptibility and feasible defense strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2050", "title": "Adversarial Robustness Design for INT8 Edge Image Classification", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you secure and harden an INT8 image classifier against adversarial inputs, extraction, and side channels within edge constraints?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2051", "title": "Quantifying Adversarial Defense Overhead on Edge AI for Real-time Systems", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum extra latency an adversarial defense can add while maintaining a minimum real-time processing rate of 25 FPS, and what edge trade-offs follow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2052", "title": "Diagnosing Intermittent Object Detection Failures on Hailo-8 Due to Adversarial Input", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose whether subtle Hailo-8 INT8 object detection failures are targeted adversarial attacks under 2.5W edge constraints?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2053", "title": "Edge AI Reliability: Adversarial Robustness on Google Coral TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would 
you design the anomaly detector to maintain adversarial robustness within the strict INT8, 4 TOPS, and 2W hardware constraints?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2054", "title": "Mitigating Model Extraction on Edge AI with Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a defense strategy that minimizes performance impact while effectively mitigating the extraction risk?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2055", "title": "Mitigating Adversarial Patch Attacks on Hailo-8 Edge Deployments for Autonomous Vehicles", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a Hailo-8 defense against physical adversarial patches that maintains 30 FPS within 26 TOPS INT8 and 2.5W?", "chain_ids": ["edge-chain-auto-secondary-007-19"], "chain_positions": {"edge-chain-auto-secondary-007-19": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2056", "title": "Coral Edge TPU Adversarial Defense Performance Bottleneck", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 12 ms defense bottleneck on the Coral Edge TPU, and how would you optimize it to meet the 50 ms latency budget?", "chain_ids": ["edge-chain-auto-secondary-007-20"], "chain_positions": {"edge-chain-auto-secondary-007-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2057", "title": "Model Extraction Attack on Qualcomm Cloud AI 100", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you mitigate power-side-channel model extraction on the Cloud AI 100 despite secure boot and physical access?", "chain_ids": ["edge-chain-auto-secondary-007-21"], "chain_positions": {"edge-chain-auto-secondary-007-21": 3}, "chain_tiers": {"edge-chain-auto-secondary-007-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2058", "title": "Jetson Orin Edge Monitoring: Key Telemetry for Performance Degradation", "topic": "monitoring-observability", "competency_area": "reliability", 
"track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For basic recall-level monitoring, what specific telemetry metric is crucial to track to quickly detect compute-related performance degradation, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2059", "title": "Hailo-8 Edge Deployment: Diagnosing Performance Degradation through Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "As a reliability engineer, specify what additional telemetry you would collect and how you would correlate it to pinpoint why only some Hailo-8 devices show rising latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2060", "title": "Edge TPU Fleet Reliability: MTBF Calculation for 99.9% Uptime", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What MTBF in hours is required to achieve 99.9% uptime per Edge TPU device with a 4-hour MTTR?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2061", "title": "Diagnosing Latency Spikes and Timeouts on Qualcomm Cloud AI 100 for Edge ML", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you approach diagnosing the root cause of this degradation and what specific metrics would you prioritize for investigation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2062", "title": "Real-time Edge TPU Monitoring Strategy", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design monitoring and observability for 1000 Coral Edge TPUs, including telemetry, alerts, MTBF/MTTR, stragglers, and dashboards?", "chain_ids": ["edge-chain-auto-secondary-007-24"], "chain_positions": {"edge-chain-auto-secondary-007-24": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2063", "title": "Diagnosing Edge AI p99 Latency and Power Improvements", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose intermittent p99 latency on edge devices, and estimate the latency and power improvement from an optimization?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2064", "title": "Edge AI Observability for Autonomous Perception on Jetson Orin", "topic": "monitoring-observability", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design monitoring and observability for Jetson Orin autonomous vehicles, covering telemetry, alerts, MTBF/MTTR, stragglers, and dashboards?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2065", "title": "Optimizing Real-time Edge ML Data Pipelines with Coral TPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where are the throughput bottlenecks in the Coral Edge TPU pipeline for 30 FPS, given 5 ms CPU preprocessing, 100 GOP inference, and INT8-only support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2066", "title": "Jetson Orin: Real-time Multi-Stream Video Pipeline Bottleneck Analysis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where would you investigate, and how would you apply the data pipeline equation to diagnose and optimize the system for sustained performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2067", "title": "Real-time Edge Video Pipeline Optimization Under Uplink Limits", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you optimize this video pipeline to avoid 500 ms spikes and buffer overflows under a 5 Mbps uplink budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2068", "title": "Edge TPU Data Pipeline Bottleneck Analysis for Real-time Anomaly Detection", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use the data pipeline equation to diagnose the Coral TPU ETL bottleneck and optimize 10 KB JSON messages at 10 Hz for real time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2069", "title": "Edge Data Quality for Hailo-8 Deployments", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you architect edge-side data quality validation for Hailo-8 metadata so only fresh, valid, accurate data reaches the data lake?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2070", "title": "Edge 
Data Quality and Anomaly Detection on Coral TPU", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a lightweight INT8 data quality and anomaly pipeline on Coral Edge TPUs without exceeding 2W or harming fault detection latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2071", "title": "Diagnosing Data Quality Degradation on Edge AI Accelerator", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and localize upstream data quality issues causing accuracy loss and latency on a remote Qualcomm Cloud AI 100 edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2072", "title": "Edge Data Integrity Real-time Validation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality system on Jetson Orin that enforces schemas, drift gates, and anomaly checks within 50ms and 60W?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2073", "title": "Edge Data Validation on Hailo-8 for Real-time ML", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a robust data validation pipeline for the 30 FPS video stream while minimizing overhead on the Hailo-8's 2.5W power budget?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2074", "title": "Edge Data Integrity: Coral TPU Architecture Evaluation", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Would you perform sensor validation on the Edge TPU, the host CPU, or a hybrid path, and what trade-offs drive that choice?", "chain_ids": ["edge-chain-auto-secondary-009-23"], "chain_positions": {"edge-chain-auto-secondary-009-23": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2075", "title": "Real-time Edge Data Quality on Qualcomm Cloud AI 100", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you implement edge data contracts, schema validation, quality gates, and anomaly detection within the 75W constraints?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2076", "title": "Edge Data Quality & Validation for Anomaly Detection on NVIDIA Jetson Orin", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose sensor-data quality degradation and add validation gates on Jetson Orin without breaking its 60W and latency constraints?", "chain_ids": ["edge-chain-auto-secondary-009-24"], "chain_positions": {"edge-chain-auto-secondary-009-24": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2077", "title": "Edge Data Quality and Validation for Critical Hailo-8 Deployments", "topic": "data-quality-validation", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time data quality pipeline for a Hailo-8 drone that enforces data contracts and detects sensor anomalies without cloud reliance?", "chain_ids": ["edge-chain-auto-secondary-009-22"], "chain_positions": {"edge-chain-auto-secondary-009-22": 4}, "chain_tiers": {"edge-chain-auto-secondary-009-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2078", "title": "Active Learning for Edge Model Adaptation on NVIDIA Jetson Orin", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What edge-friendly model update strategy would you use when raw data cannot be streamed for relabeling over the limited satellite uplink?", "chain_ids": ["edge-chain-auto-secondary-003-22"], "chain_positions": {"edge-chain-auto-secondary-003-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2079", "title": "Edge Active Learning and Bias Mitigation on Hailo-8", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select Hailo-8 traffic samples for annotation, estimate uncertainty efficiently, and mitigate bias under 26 TOPS INT8 and 2.5W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2080", "title": "Edge AI Dataset Curation for Rare Events on Coral TPU", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate, label, and use edge-selected data to improve rare-class detection on a Coral Edge TPU with intermittent connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2081", "title": "Active 
Learning Labeling Break-Even for Edge Image Classification", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "At what labeled-sample count does active learning become cheaper than random sampling given 0.50 labels, 30% fewer labels, and 5,000 upfront cost?", "chain_ids": ["edge-chain-auto-secondary-003-25"], "chain_positions": {"edge-chain-auto-secondary-003-25": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2082", "title": "Coral Edge TPU Disease Detection: Active Learning for Rare Disease Data Curation", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design and quantify an active learning loop to improve rare plant disease detection on a Coral Edge TPU under INT8 and annotation-budget constraints?", "chain_ids": ["edge-chain-auto-secondary-003-23"], "chain_positions": {"edge-chain-auto-secondary-003-23": 0}, "chain_tiers": {"edge-chain-auto-secondary-003-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2083", "title": "Optimizing Edge AI Dataset Curation for Qualcomm Cloud AI 100 Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you compare traditional batch annotation versus active learning for Cloud AI 100 defect detection and choose a curation strategy?", "chain_ids": ["edge-chain-auto-secondary-003-25"], "chain_positions": {"edge-chain-auto-secondary-003-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-003-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2084", "title": "Optimizing Dataset Curation for Hailo-8 Edge Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the bottleneck in the current data pipeline and quantify the potential improvement of your proposed solution, leveraging the NPU capabilities?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2085", "title": "50ms Edge Anomaly Detection for 100 Hz Sensor Streams", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect a 50ms edge anomaly detection pipeline for hundreds of 100 Hz sensors with intermittent cloud connectivity?", "chain_ids": ["edge-chain-auto-secondary-014-11"], "chain_positions": {"edge-chain-auto-secondary-014-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "edge-2086", "title": "Real-time Sensor Data Ingestion on NVIDIA Jetson Orin", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What raw sensor bandwidth is required, can the Orin's 68 GB/s effective LPDDR5 handle it, and how would you provision resources for these tasks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2087", "title": "Edge Real-Time Anomaly Detection with Qualcomm Cloud AI 100", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What challenges and design choices matter for real-time processing of 1.2 GB/s camera and LIDAR streams on a Qualcomm Cloud AI 100 edge device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2088", "title": "Optimizing Real-time Edge Inference on Coral TPU for Defect Detection", "topic": "streaming-ingestion", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What bottlenecks are limiting the Edge TPU pipeline to 15 FPS, and how would you optimize and measure progress toward 30 FPS?", "chain_ids": ["edge-chain-auto-secondary-014-13"], "chain_positions": {"edge-chain-auto-secondary-014-13": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2089", "title": "Hailo-8 Edge Data Pipeline: Optimizing Storage for Efficient Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L1", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format and compression traits would you choose for 500 MB/s patch ingestion on the Hailo-8, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2090", "title": "Edge TPU Storage: Optimizing Data Formats for Real-time Inference Buffering", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you store and compress 12 hours of inference results at 5 FPS for energy-efficient, reliable local buffering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2091", "title": "Optimizing Data Storage for On-Device Inference on Qualcomm Cloud AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What storage format strategy would you use for sensor data, intermediate results, and checkpoints on Cloud AI 100 to balance compression, speed, and LPDDR4x use?", "chain_ids": ["edge-chain-auto-secondary-008-34"], "chain_positions": 
{"edge-chain-auto-secondary-008-34": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2092", "title": "Optimizing Sensor Data Storage on NVIDIA Jetson Orin for Edge ML", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which format and compression optimize Orin analytics plus upload, and how many days of 10-feature float32 records fit in 64 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2093", "title": "Edge AI Feature Store Optimization for Real-time Inference", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which storage format and compression strategy would you choose for 50MB frames at 100 FPS on the edge accelerator, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2094", "title": "Optimizing Edge ML Data Storage on NVIDIA Jetson Orin for Autonomous Drones", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which two local storage architectures best trade off footprint, power, and replay fidelity for 24h of 1080p30 video on a 256GB Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2095", "title": "Edge AI Data Storage Optimization on Qualcomm Cloud AI 100", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design the local storage pipeline for 72 hours of sensor data on Cloud AI 100 while minimizing I/O and memory overhead?", "chain_ids": ["edge-chain-auto-secondary-008-34"], "chain_positions": {"edge-chain-auto-secondary-008-34": 2}, "chain_tiers": {"edge-chain-auto-secondary-008-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2096", "title": "Data Pruning for Edge Deployment on Qualcomm Cloud AI 100", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is data pruning, how does it differ from coreset selection, and when would it help deploy on the Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2097", "title": "Edge Anomaly Detection and the Data Wall on Jetson Orin", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do the data wall and poor ICR arise, and what data selection strategy would you use to fix them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2098", "title": "Edge ML with Hailo-8: Data Efficiency for Continuous Adaptation", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you select and process data for continuous learning on Hailo-8 without exceeding the power budget or risking model collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2099", "title": "Data Pruning for Edge TPU Deployment", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantify the potential benefits of such a strategy, specifically targeting the limitations of the Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2100", "title": "Diagnosing Data Wall Challenges on Qualcomm Cloud AI 100 for Edge ML", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data-efficient strategy resolves this 'data wall' while remaining within the strict 32 GB and 75W edge hardware constraints?", "chain_ids": ["edge-chain-auto-secondary-007-05"], "chain_positions": {"edge-chain-auto-secondary-007-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2101", "title": "Data-Efficient Continuous Learning for Anomaly Detection on NVIDIA Jetson Orin", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the Jetson Orin data pipeline to avoid the data wall and model collapse during daily on-device fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2102", "title": "Optimizing Data for Edge AI on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you approach identifying and mitigating this issue, specifically leveraging data efficiency and selection techniques given the Hailo-8's constraints?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2103", "title": 
"Optimizing Data Efficiency for Edge Deployment on NVIDIA Jetson Orin", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the data bottleneck, use coreset selection or pruning, and quantify gains toward 30 FPS and 90% mAP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2104", "title": "Edge Data Efficiency for Real-time Object Detection on Hailo-8", "topic": "data-efficiency-selection", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use data selection to reduce 1080p 30 FPS drone input while preserving on-device adaptation capabilities?", "chain_ids": ["edge-chain-auto-secondary-007-06"], "chain_positions": {"edge-chain-auto-secondary-007-06": 4}, "chain_tiers": {"edge-chain-auto-secondary-007-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2105", "title": "Federated Learning on Hailo-8 Edge Devices: Power-Efficient Convergence with Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do model architecture, local epochs, and aggregation frequency trade off convergence, communication, and energy on Hailo-8 federated learning?", "chain_ids": ["edge-chain-auto-017-02"], "chain_positions": {"edge-chain-auto-017-02": 0}, "chain_tiers": {"edge-chain-auto-017-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2106", "title": "Designing Cross-Device Federated Learning on Google Coral Edge TPUs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design federated averaging for Coral Edge TPU devices with CPU-side updates, communication limits, and non-IID data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2107", "title": "Optimizing Federated Learning Communication on Edge AI Accelerators", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose and reduce the federated learning communication bottleneck for 10M float32 weights from the client devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2108", "title": "Fairness Evaluation on Edge AI: Qualcomm Cloud AI 100", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the root causes of demographic approval disparities on the INT8 credit scoring 
deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2109", "title": "On-Device Fairness Evaluation with Hailo-8", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design this monitoring mechanism, and what formula tracks the FNR difference between groups?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2110", "title": "Diagnosing Bias in Edge TPU Models for Fairness", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose demographic bias on the INT8-only Coral Edge TPU across data, on-device evaluation, and quantization effects?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2111", "title": "Fairness-Aware Model Deployment on Edge AI Accelerator", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design on-device inference and lightweight fairness monitoring for demographic parity and equalized odds on Cloud AI 100?", "chain_ids": ["edge-chain-auto-secondary-009-30"], "chain_positions": {"edge-chain-auto-secondary-009-30": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2112", "title": "Fairness Evaluation on Edge Devices with NVIDIA Jetson Orin", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate and monitor demographic parity for pedestrian detection on the Jetson Orin within compute and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2113", "title": "Fairness Evaluation on Hailo-8 Edge Deployment", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a comprehensive fairness evaluation and mitigation strategy tailored to the computational constraints of the Hailo-8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2114", "title": "Responsible AI Deployment on Edge: Jetson Orin's Predictive Maintenance Bias Analysis", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you use model cards, impact assessments, and red-teaming on Jetson Orin to explain facility-specific false positives and misses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2115", "title": "Investigating False Positives in Industrial Edge Anomaly Detection After Factory Deployment", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you investigate the false positives at the new factory using data drift analysis, model cards, and impact assessments?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2116", "title": "Responsible AI on Edge: Real-time Governance for Autonomous Drones", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you architect the responsible AI pipeline to balance real-time guardrails with comprehensive impact assessments under the 60W constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2117", "title": "Responsible AI Evaluation on Edge TPU: Quantized Model Comparison", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compare Model A and Model B with model cards, explainability, failure modes, metrics, and guardrails on Coral Edge TPU?", "chain_ids": ["edge-chain-auto-secondary-010-04"], "chain_positions": {"edge-chain-auto-secondary-010-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-010-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2118", "title": "Responsible AI Optimization on Edge: Bias Mitigation for Real-time Object Detection on NVIDIA Jetson Orin", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the P99 latency spikes and propose a quantifiable fix that also addresses demographic bias at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2119", "title": "Responsible AI for Edge Autonomous Systems on Hailo-8", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build an edge-native Responsible AI and governance framework for Hailo-8 pedestrian detection in delivery robots?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2120", "title": "INT8 Quantization Impact on Regression Heads", "topic": "mixed-precision-training", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does INT8 quantization make box regression jittery while classification stays accurate, and what FPS can 65 GOPS/frame achieve at 50% Hailo-8 utilization?", "chain_ids": 
["edge-chain-auto-secondary-011-29"], "chain_positions": {"edge-chain-auto-secondary-011-29": 0}, "chain_tiers": {"edge-chain-auto-secondary-011-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2121", "title": "Queuing Backpressure on Jetson Orin Edge Inference", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 30 FPS Jetson Orin pipeline queue grow with 40ms inference, and what fixes would make it stable?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2122", "title": "Little's Law Memory Sizing for Hailo-8 Inference Buffer", "topic": "queueing-theory", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using Little's Law, how much buffer memory is needed for 50 events/s with 15ms processing and 4KB events, and is 256KB SRAM enough?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2123", "title": "Coral Edge TPU Throughput Bound via Little's Law", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Using Little's Law, is 30 inferences/s plausible with 28ms Coral latency, and what mean concurrency L would it require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2125", "title": "Datacenter vs Edge Efficiency Trade-off for Video Analytics", "topic": "tco-cost-modeling", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do cloud streaming and Jetson Orin edge processing compare in total power and 3-year TCO for 1000 traffic cameras?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2128", "title": "Tail Latency from Thermal Throttling on Jetson Orin", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the Jetson Orin P99 latency to jump from 28ms to 95ms at 40°C, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2129", "title": "Interrupt Latency Spikes in Real-Time Edge Inference", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does Hailo-8 ResNet-50 P99 latency hit 40ms during network bursts, and how would you reduce the host-side tail latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2130", "title": "Coral Edge TPU Queue Depth and Tail Latency", 
"topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Coral MobileNetV2 P99 latency rise to 55ms under Raspberry Pi load, and how would you reduce it without blaming USB3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2135", "title": "Roofline Analysis for CNN Inference on Jetson Orin Nano", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Where does ResNet-50 conv1 land on the Orin roofline, and is compute or memory bandwidth limiting the 18 TOPS result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2136", "title": "Sliding Window Attention for Long Context on Jetson AGX Orin", "topic": "attention-scaling", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For a batch of two 16K-token sequences, how do full attention and 4096-token SWA compare in KV memory and decode bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2137", "title": "INT4 Weight-Only Quantization for LLM on Hailo-8", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What packed W4A8 weight size fits in 4GB, and how does INT4 dequantization affect Hailo-8 throughput?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 1}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2138", "title": "LPDDR5 vs On-Chip SRAM Trade-offs on Jetson Orin for Real-Time Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For YOLOv8-m at 30fps on Orin NX, what memory access pattern matters and what is the binding constraint?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 2}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2139", "title": "Micro-Batch Pipeline Scheduling on Jetson Orin for Video Analytics", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If detection takes 15ms and classification 5ms per frame, what throughput and latency result from pipelining them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2140", "title": "Power Budget Allocation for Multi-Model Pipeline on Jetson Orin 60W", "topic": 
"power-budgeting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the peak power of this Orin pipeline, and can DLA-offloading detection fit it within a 45W TDP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2141", "title": "Ring All-Reduce vs Gossip on 1GbE for 100MB Federated Learning Sync", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "For 5 nodes syncing a 100MB model over 1GbE, how do ring all-reduce and gossip protocol compare in latency, and what is the relative overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2142", "title": "TensorRT INT8 Calibration and Latency Optimization on Jetson Orin", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you calibrate YOLOv8-l for INT8 on Orin, and what latency-accuracy trade-off should entropy calibration achieve?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2143", "title": "Knowledge Distillation for Compressing ViT to MobileNet on Hailo-8", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With feature-based distillation from ViT-B/16 to MobileNetV3-small on Hailo-8, what top-1 accuracy is realistically achievable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2144", "title": "Thermal Design for Fanless Jetson Orin in IP67 Enclosure", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 40°C ambient with 11°C/W total thermal resistance, what TGP can the sealed Orin NX sustain and will 15W throttle?", "chain_ids": ["edge-chain-auto-secondary-006-14"], "chain_positions": {"edge-chain-auto-secondary-006-14": 2}, "chain_tiers": {"edge-chain-auto-secondary-006-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2145", "title": "Quantization Error Localization for Accuracy Drop on Coral Edge TPU", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the 2.5% INT8 accuracy drop when layers 12-15 are the most quantization-sensitive on Coral Edge TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2146", "title": "OTA Payload Comparison for Full, Delta, and LoRA Edge Model Updates", "topic": "pruning-sparsity", 
"competency_area": "memory", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Over 10 Mbps LTE, how do full 50MB OTA updates, weight deltas, and LoRA-style incremental updates compare for the deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2147", "title": "Real-Time Inference Pipeline Specification for Autonomous Drone on Jetson Orin", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you allocate GPU, DLA, and other compute for the drone's 60fps detection, 10fps segmentation, and 100Hz localization within 60W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2148", "title": "Model Quantization Selection for Coral Edge TPU with Accuracy Constraint", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which INT8 strategy meets >20fps and <5% rank-1 loss, and why should mixed FP16 precision be avoided on this TPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2149", "title": "Depthwise Separable Conv vs Standard Conv Roofline on Jetson Orin DLA", "topic": "roofline-analysis", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "On Orin DLA, how do 3x3 standard and depthwise separable 64->64 convs compare in FLOPs, parameters, and roofline position?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2150", "title": "Thermal Runaway Prevention for Sustained AI Workload on Hailo-8", "topic": "thermal-management", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 55°C ambient and θ_ja=20°C/W, what power can the accelerator sustain and does a 2.5W continuous workload need throttling?", "chain_ids": ["edge-chain-auto-secondary-006-15"], "chain_positions": {"edge-chain-auto-secondary-006-15": 1}, "chain_tiers": {"edge-chain-auto-secondary-006-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2151", "title": "Knowledge Distillation from Transformer to CNN for Hailo-8", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What mAP gain can feature distillation from ViT-S to ResNet-18 deliver on Hailo-8, and what representation issues arise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2152", "title": "INT8 Activation Quantization Range Calibration for Object Detection on Jetson Orin", "topic": "quantization-fundamentals", "competency_area": "precision", "track": 
"edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does YOLOv8-m lose 3.8% mAP in daytime after nighttime-only INT8 calibration, and how would you fix it?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 0}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2153", "title": "Model Partitioning for Collaborative Inference: Edge+Cloud Split on Jetson Orin", "topic": "model-serving-infrastructure", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With a 5 Mbps uplink, where should you split the 50-layer ResNet-152 between Jetson Orin and cloud to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2154", "title": "Multi-Frame Batch Accumulation for Throughput on Jetson Orin vs Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What batch size should YOLOv8-s use to maximize throughput while keeping event alerts under 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2161", "title": "Depthwise Convolutions on Jetson Orin", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing 3×3 convolutions with depthwise separable convolutions cut FLOPs 8× but improve Orin latency only 2×?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2162", "title": "Inference Latency Spikes During OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Cortex-M7 inference latencies spike from 15ms to exactly 55ms during OTA writes to the inactive flash partition?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2163", "title": "Thermal Throttling Frame Drops", "topic": "real-time-deadlines", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone pipeline miss its 33.3ms deadline after clocks are halved, even though throttled TOPS still exceed the workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2165", "title": "TFLite INT8 Conversion on Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the INT8 weight size in flash 
and remaining SRAM tensor arena on a 256,000-byte SRAM Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2166", "title": "ONNX Runtime TensorRT EP vs Standalone TensorRT on Jetson Orin", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is the portability of ONNX Runtime worth the 12% latency overhead over standalone TensorRT, and how can it be mitigated?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2167", "title": "SM80 Kernel Occupancy Limits on SM87 Jetson Ampere GPUs", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the sm_80 CUDA kernel achieve 25% lower throughput than expected on the sm_87 edge GPU despite both being Ampere architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2168", "title": "Portable Model Optimization Pipeline for Multi-Edge Deployment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a portable deployment pipeline for the same perception model across Orin, Qualcomm SA8650P, and TI TDA4VM?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2169", "title": "Jetson Orin Nano TFLite GPU Delegate vs ONNX Runtime CUDA EP", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which runtime yields better throughput per watt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2170", "title": "Handling INT8 Quantization Discrepancies Across Edge Runtimes", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is ORT INT8 quantization 2.3 mAP worse than TensorRT on the same calibration set, and how would you close the gap?", "chain_ids": ["edge-chain-auto-secondary-016-11"], "chain_positions": {"edge-chain-auto-secondary-016-11": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-11": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2171", "title": "Power-Aware Runtime Selection on Jetson Orin Power Profiles", "topic": "software-portability", 
"competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should the Orin inference service adapt its model and TensorRT backend as nvpmodel switches from 60W to 15W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2172", "title": "CUDA-to-Vulkan Compute Porting for Cross-Vendor Edge GPUs", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Should you port the custom CUDA inference kernels to Vulkan Compute for Jetson and Intel Arc, or use a higher-level abstraction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2173", "title": "Model Format Conversion Fidelity: PyTorch to CoreML to ONNX Round-Trip", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you debug the 0.05 FP32 output mismatch from PyTorch->CoreML->ONNX, and what conversion flow should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2174", "title": "Heterogeneous Compute Dispatch on Jetson Orin: GPU vs DLA vs CPU", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you dispatch perception, planning, and SLAM across Orin's GPU, DLAs, and ARM CPU within the 60W budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2175", "title": "OTA Model Update Portability for Heterogeneous Edge Fleets", "topic": "software-portability", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you package and validate an OTA model and post-processing update for the 5000-device heterogeneous fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2181", "title": "Tail Latency on Edge with Thermal Throttling", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you maintain P99 below 25ms within the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2182", "title": "Latency Jitter from DVFS on Edge Devices", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", 
"question": "How do you stabilize latency without exceeding the 60W power budget?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2183", "title": "Priority Preemption for Real-Time Edge Inference", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you prevent the lane model from pushing the pedestrian detector past 15ms while staying within the 60W envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2184", "title": "Deterministic Latency with TensorRT on Orin", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What P99 latency should you expect after converting the PyTorch model to fixed-shape TensorRT, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2185", "title": "Memory Controller Contention on Edge SoCs", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much LPDDR5 bandwidth contention is causing the 40 ms P99 spikes, and how would you restructure preprocessing to reduce them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2186", "title": "Register Pressure on Jetson Orin Ampere SMs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might reducing register usage to 64 registers per thread (causing register spilling) decrease performance despite increasing occupancy?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 2}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2187", "title": "Tensor Core Availability on Edge GPUs", "topic": "gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "For a 3x3 depthwise conv with 320 channels over 7x7 positions, how many FLOPs is it, and why does this operation fail to utilize Tensor Cores?", "chain_ids": ["edge-chain-auto-secondary-007-22"], "chain_positions": {"edge-chain-auto-secondary-007-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-007-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2188", "title": "INT8 Quantization and Tensor Core Throughput on Orin", "topic": 
"gpu-compute-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected INT8 latency for the 22ms FP16 path after accounting for memory, quantization, mixed-precision, and overhead terms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2189", "title": "Storage Format for Edge Inference on Jetson Orin", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long will 500K individual JPEG reads take from NVMe, and what storage format would meet the 10-minute recalibration budget?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2190", "title": "FlatBuffers vs Protobuf for Edge Model Serving", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did TFLite choose FlatBuffers, and what performance property would you lose by switching to Protobuf?", "chain_ids": ["edge-chain-auto-secondary-008-32"], "chain_positions": {"edge-chain-auto-secondary-008-32": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2191", "title": "Delta-Compressed ONNX Updates over Cellular Links", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a 200 MB ONNX update take over 50 Mbps, and what update format would cut bandwidth by at least 80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2192", "title": "Sparse Format for Edge Model Weights", "topic": "storage-format-selection", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should you store the weights in a sparse format (CSR or bitmap+values), and how does this affect inference latency vs memory savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2193", "title": "Device-Level DP Noise Calibration for Edge Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose clipping norm and noise multiplier for device-level DP over 100 rounds with 100 of 1000 devices sampled?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": {"edge-chain-auto-secondary-017-40": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2194", "title": "Privacy Budget for Continuous Edge 
Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Using the moments accountant, what noise multiplier σ is needed, and at what point does the model stop improving because the noise overwhelms the gradient signal?", "chain_ids": ["edge-chain-auto-secondary-017-40"], "chain_positions": {"edge-chain-auto-secondary-017-40": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-40": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2195", "title": "Tensor Arena Planning for Jetson Orin Multi-Model Serving", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you plan memory allocation to prevent fragmentation?", "chain_ids": ["edge-chain-auto-secondary-017-43"], "chain_positions": {"edge-chain-auto-secondary-017-43": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-43": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2216", "title": "Exporting a model with custom autograd to TensorRT for Jetson Orin deployment", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you export custom autograd NMS to TensorRT, and what is the theoretical latency for a 4 GOP INT8 model on a 275 TOPS Orin engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2217", "title": "Reducing activation memory for on-device fine-tuning on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the FP32 activation memory for a batch=8, 96-channel, 28x28 feature map, and why is 7.2 MB an incorrect estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2218", "title": "Trace-based optimization of computational graph for Jetson Orin deployment", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you optimize the 45 ms PyTorch YOLOv8n model for 30 fps inference on Jetson Orin without gradient overhead?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2219", "title": "Implementing efficient backward pass for continual learning on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you compute the Fisher diagonal over 1,000 samples for a 10M-parameter EWC model efficiently on the edge device?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2220", "title": "Debugging gradient flow through quantization-aware training on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should FakeQuant be implemented so gradients flow during QAT for the INT8 target?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2221", "title": "Memory-efficient inference graph for multi-model pipeline on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you reduce RAM and startup overhead for the three-model Orin pipeline while keeping inference correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2222", "title": "Implementing sparse gradient updates for efficient edge fine-tuning", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement top-k sparse gradient updates for the 20M-parameter transformer to reduce future federated communication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2223", "title": "Power-aware training schedule using autograd profiling on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you keep adaptive fine-tuning on Orin within a 25 W budget without hitting the 25 ms thermal-throttled regime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2224", "title": "Federated learning gradient compression with autograd hooks on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you apply PowerSGD to reduce memory pressure and test whether 120 MB gradients fit a 1 Mbps federated round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2225", "title": "Analyzing computational graph overhead for real-time control on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the compute-bound inference time for a 4M-op controller on a 275 TOPS Orin, and what unit error makes it 14.5 µs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2226", "title": "Optimizing memory layout of saved tensors 
for backward on constrained Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you eliminate CPU-GPU activation offload bottlenecks during backward for the depthwise separable model on Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2227", "title": "Implementing efficient knowledge distillation with autograd on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you reduce activation memory for on-device distillation when the 20M-parameter teacher is frozen?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2228", "title": "Analyzing when torch.compile helps vs hurts on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why might torch.compile make this 5M-parameter Orin inference model slower, and how would you decide whether to keep it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2229", "title": "Graph capture stability for streaming inference on Jetson Orin", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should you manage LSTM input and hidden-state buffers when replaying a CUDA graph for streaming audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2230", "title": "Energy-optimal gradient computation schedule for battery-powered edge device", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How many learning steps can you afford per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2231", "title": "Chiplet Design for Edge Inference Power Constraints", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is a 2x5W chiplet design preferable to one 10W monolithic die for an edge inference accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2232", "title": "Shared LPDDR5 bandwidth contention on Jetson Orin", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does Jetson Orin's unified 
LPDDR5 architecture differ from MI300X, and how do NUMA-like bandwidth effects appear?", "chain_ids": ["edge-chain-auto-secondary-017-03"], "chain_positions": {"edge-chain-auto-secondary-017-03": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2233", "title": "Die-to-Die Power Gating at Edge", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What power-gating strategy keeps an ISP/ML chiplet edge device under idle power targets while meeting a <10 ms wake latency?", "chain_ids": ["edge-chain-auto-secondary-017-02"], "chain_positions": {"edge-chain-auto-secondary-017-02": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2234", "title": "Interposer vs Package-on-Package for Edge Chiplets", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which packaging technology is preferable for a wearable device targeting sub-300mW?", "chain_ids": ["edge-chain-auto-secondary-017-01"], "chain_positions": {"edge-chain-auto-secondary-017-01": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2235", "title": "Heterogeneous Chiplet ISA Compatibility", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What software stack challenges arise when TFLite must target ARM CPU chiplets plus a custom RISC-V ML accelerator die?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2236", "title": "Continual LoRA Adaptation on Edge Devices", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you update the weekly LoRA adapter on the edge device without catastrophic forgetting of prior robot behaviors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2237", "title": "LoRA Inference on Jetson Orin: Adapter Fusion Strategy", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Should you merge the LoRA adapter before deployment or apply it dynamically during inference on Jetson Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2238", "title": "PEFT Memory Budgets on Jetson Orin for On-Device Fine-tuning", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "realization", "bloom_level": 
"create", "status": "published", "phase": "both", "question": "What memory budget and training throughput should you expect for on-device LoRA fine-tuning of a 1B model on Jetson Orin?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2239", "title": "Adapter Compression for Bandwidth-Constrained Edge Deployment", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you deliver LoRA adapter updates to 10,000 edge devices over 1 Mbps cellular links?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2240", "title": "Adapter Rollback and Version Management at Edge", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you roll back and diagnose a LoRA adapter OTA update when 3% of 10,000 Orin devices show higher error rates?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2241", "title": "On-Device DP Inference for Medical Wearables", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How does adding LDP noise to logits compare with keeping the 500K-parameter ECG classifier entirely on-device?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2242", "title": "DP Noise Calibration for Sensor Fusion on Edge", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did per-feature LDP noise raise false negatives from 2% to 18%, and how would you fix the sensor fusion pipeline?", "chain_ids": ["edge-chain-auto-secondary-017-41"], "chain_positions": {"edge-chain-auto-secondary-017-41": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-41": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2243", "title": "Fairness Degradation from Quantization on Edge", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did INT8 PTQ widen the skin-tone FPR gap from 1.8% to 6.2%, and how would you fix the calibration process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2244", "title": "Impact Assessment for Autonomous Medical 
Triage on Edge", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What components and quantitative thresholds must the AI impact assessment include before deploying the hospital triage assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2245", "title": "Edge Training Facility PUE and WUE Accounting", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the annual PUE overhead energy waste, water consumption, savings from cloud migration, and break-even PUE/WUE for on-prem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2246", "title": "Embodied Carbon Dominance in TinyML Edge Deployment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the embodied-versus-operational carbon split, and does reducing the Pi workload from 5W to 2W meaningfully cut total carbon?", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2247", "title": "Idle Power Governance for Always-On Edge Inference", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Quantify the carbon and cost impact of the current idle waste and evaluate the technician's proposal?", "chain_ids": ["edge-chain-auto-secondary-017-53"], "chain_positions": {"edge-chain-auto-secondary-017-53": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-53": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2248", "title": "Carbon Footprint for Continuous Edge Inference Deployment", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "edge", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the annual carbon footprint of the 5,000-node edge fleet versus H100 cloud inference, and where is the carbon break-even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2249", "title": "Jetson Orin Thermal Throttling and Inference Latency Tail", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the afternoon p99 latency spike, how does thermal throttling explain it, and how would you harden the deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "edge-2250", "title": "Stochastic Input Variance and P99 Latency on Jetson Orin Vision Pipelines", "topic": "tail-latency", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the current architecture meet the p99 < 40ms SLA, and what changes would bring it under the deadline?", "chain_ids": ["edge-chain-auto-secondary-016-12"], "chain_positions": {"edge-chain-auto-secondary-016-12": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2251", "title": "Chiplet Thermal Envelope for Edge AI SoC", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What thermal solution would keep the 60W chiplet SoC from throttling at 55°C ambient?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2252", "title": "Die-to-Die Bandwidth Adequacy for Edge Vision Pipeline", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is the 50 GB/s die-to-die link the bottleneck for this 4K60 vision pipeline?", "chain_ids": ["edge-chain-auto-secondary-017-02"], "chain_positions": {"edge-chain-auto-secondary-017-02": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2253", "title": "Power Island Gating Across Chiplet Dies for Edge Battery Life", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What power-gating strategy should you use, and can a 10Wh battery support 72 hours with 8 hours of total NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2254", "title": "POP vs 2.5D Interposer for Edge AI Module Cost", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "At 500K units/year, how do PoP and 2.5D interposer integration compare on bandwidth and cost?", "chain_ids": ["edge-chain-auto-secondary-017-01"], "chain_positions": {"edge-chain-auto-secondary-017-01": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2255", "title": "NUMA-Aware Runtime Scheduling on Embedded Chiplet", "topic": "chiplet-architecture", "competency_area": "compute", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you fix the 25% ML inference latency penalty caused by OS thread migrations between the two CPU dies?", "chain_ids": 
["edge-chain-auto-secondary-017-03"], "chain_positions": {"edge-chain-auto-secondary-017-03": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2256", "title": "LoRA Adapter Update Over-the-Air for Edge Devices", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the OTA LoRA adapter update system for 10,000 devices to minimize bandwidth and ensure reliability?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2257", "title": "On-Device LoRA Inference with INT4 Weight Quantization", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you optimize BF16 LoRA adapter computation to eliminate the 35% latency overhead on the INT4 base model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2258", "title": "Federated LoRA Aggregation for Edge Fleet Personalization", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a communication-efficient federated averaging protocol for LoRA?", "chain_ids": ["edge-chain-auto-secondary-016-10"], "chain_positions": {"edge-chain-auto-secondary-016-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-016-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2259", "title": "LoRA Adapter Merge for Memory-Constrained Inference", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is the 3B INT4 model with an r=64 LoRA adapter failing on 4GB RAM, and how would you fix it without degrading accuracy?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2260", "title": "LoRA Fine-Tuning on Jetson Orin Unified Memory", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you configure and run LoRA training on a Jetson Orin NX without a discrete GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2265", "title": "Orin Multi-Modal Fusion Queue Instability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": 
"Analyze the stability of the fusion queue over a 10-second operational window?", "visual": {"kind": "svg", "path": "edge-2265.svg", "alt": "A linear chart demonstrating the queue size steadily increasing over the 10 second window because service is slower than arrival.", "caption": "Unstable Queue Growth (ρ > 1)"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2266", "title": "Jetson Orin Zero-Copy Camera Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory pipeline that avoids a bus bottleneck, calculating the LPDDR5 bandwidth consumed if a naive CPU-to-GPU data copy is used vs a zero-copy pointer pass?", "visual": {"kind": "svg", "path": "edge-2266.svg", "alt": "Bar chart comparing 12 GB/s memory bandwidth usage in a naive pipeline vs 6 GB/s with zero-copy.", "caption": "Memory Bandwidth Savings (Zero-Copy)"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2271", "title": "Orin Shared LPDDR5 Deficit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory QoS (Quality of Service) scheme to resolve the contention and calculate the resulting memory bandwidth deficit?", "visual": {"kind": "svg", "path": "edge-2271.svg", "alt": "Stacked bar chart showing 230 GB/s request vs 204.8 GB/s limit.", "caption": "Memory Bandwidth Contention."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2276", "title": "Hailo-8 Accelerator Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption over a continuous 1-second operational cycle?", "visual": {"kind": "svg", "path": "edge-2276.svg", "alt": "A square-wave timeline chart showing power spikes to 2.5W for 100ms, followed by 0.1W baseline.", "caption": "Accelerator Power Duty Cycle"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2280", "title": "Edge Gateway Latency Bounds", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What maximum processor utilization rho keeps average wait time at or below 40ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2281", "title": "Edge KV Cache Capacity Sizing", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming a flat 1 MB of KV cache is required per context token, calculate the maximum 
number of concurrent 1024-token requests the device can sustain?", "visual": {"kind": "svg", "path": "edge-2281.svg", "alt": "A stacked bar chart showing the breakdown of the 32GB RAM into Weights, OS, and KV Cache.", "caption": "Edge Memory Allocation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2282", "title": "Edge 4-Stage Synchronous Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the theoretical latency of a single frame and the steady-state pipeline throughput in FPS?", "visual": {"kind": "svg", "path": "edge-2282.svg", "alt": "A four-stage diagonal Gantt chart showing consecutive 16ms blocks overlapping in a standard pipeline execution pattern.", "caption": "Synchronous Edge Pipeline"}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2285", "title": "Orin Mesh Bisection Bandwidth", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the bisection bandwidth of this 4-node fully connected mesh network?", "visual": {"kind": "svg", "path": "edge-2285.svg", "alt": "A complete graph of four nodes where every node is connected to every other node forming a cross inside a box.", "caption": "4-Node Fully Connected Mesh"}, "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2286", "title": "Hailo-8 Solar Camera", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how modifying the sleep/wake schedule to a 20% active duty cycle allows the Hailo-8 to meet the 0.5W budget assuming negligible sleep power?", "visual": {"kind": "svg", "path": "edge-2286.svg", "alt": "Power consumption over time switching between 2.5W and 0W.", "caption": "Hailo-8 Power Cycling"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2287", "title": "Jetson Shared Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the bandwidth difference and physical separation between the CPU and GPU when accessing the shared memory?", "visual": {"kind": "svg", "path": "edge-2287.svg", "alt": "Bar chart comparing zero PCIe overhead to the shared memory bandwidth.", "caption": "Unified 
Memory Architecture Bandwidth"}, "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 1}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2288", "title": "Jetson Orin Federated Ring", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Recall the communication steps of a Ring AllReduce algorithm used to synchronize gradients across the 4 nodes?", "visual": {"kind": "svg", "path": "edge-2288.svg", "alt": "Ring topology diagram with 4 Jetson nodes connected in a circle.", "caption": "4-Node Ring Topology"}, "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 0}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2290", "title": "Accelerator Intersection Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the utilization of the chip using basic queueing theory?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2292", "title": "Hailo-8 PCIe Gen 3 Limit", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the maximum theoretical bandwidth of the PCIe Gen 3 x4 interface connecting the edge host to the accelerator?", "visual": {"kind": "svg", "path": "edge-2292.svg", "alt": "Bar chart comparing 1 lane vs 4 lanes of PCIe Gen 3 bandwidth.", "caption": "PCIe Gen 3 Bandwidth"}, "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 0}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2294", "title": "Edge Video Pipeline Queueing Delay Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the expected queueing delay utilizing M/M/1 principles given the average arrival and service rates?", "visual": {"kind": "svg", "path": "edge-2294.svg", "alt": "Hockey-stick curve showing exponential growth of queueing delay as utilization approaches 1.0.", "caption": "Queue Length vs Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2296", "title": "Accelerator Model State Reload Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the hard downtime in seconds if the reset sequence takes 500ms before memory copying begins?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2297", "title": "Rolling Checkpoints for Drone Tracking", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how writing a compact rolling checkpoint to NVMe limits the Recovery Time Objective (RTO)?", "visual": {"kind": "svg", "path": "edge-2297.svg", "alt": "Timeline comparing a long fresh initialization vs a short checkpoint recovery.", "caption": "RTO Reduction via Checkpoint"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2298", "title": "PCIe Collective Communication Efficiency", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why Gather-Compute-Broadcast by the host CPU is more efficient here than a device-to-device Ring AllReduce?", "chain_ids": ["edge-chain-auto-secondary-009-21"], "chain_positions": {"edge-chain-auto-secondary-009-21": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-21": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2299", "title": "GMSL Camera PCIe Bottleneck on Orin", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the total required bandwidth and determine if the PCIe Gen4 x4 link will bottleneck the streams?", "visual": {"kind": "svg", "path": "edge-2299.svg", "alt": "Topology showing 4 cameras feeding into a capture card routed over a PCIe x4 bus to the Orin SoC.", "caption": "Camera to PCIe Topology"}, "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2300", "title": "PIR-Triggered Accelerator Power States", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the specific power mode the system and accelerator should maintain prior to the PIR trigger to maximize battery life?", "visual": {"kind": "svg", "path": "edge-2300.svg", "alt": "Timeline showing near-zero sleep power, a PIR trigger event, and an active compute spike.", "caption": "Event-Driven Power Profile"}, "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2301", "title": "Edge Ethernet Transfer Latency", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the transfer 
latency per frame and determine if the 1 Gbps link is sufficient?", "visual": {"kind": "svg", "path": "edge-2301.svg", "alt": "Simple point to point diagram showing two Orins connected by an Ethernet link.", "caption": "Orin Ethernet Link"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2302", "title": "Jetson Orin Zero-Copy Vision", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory-mapped zero-copy pipeline specification from the camera ISP to the GPU to ensure 4x 1080p 60FPS streams don't bottleneck the 204.8 GB/s LPDDR5 bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2303", "title": "Hailo-8 Multi-Model Packing", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the utilization of a single Hailo-8 chip and determine how many additional 1 TOPS pedestrian detection models can fit within the remaining compute budget?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 2}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2305", "title": "Orin JPEG Decode to TensorRT Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a Jetson decode-to-inference pipeline that keeps JPEG frames in NVMM and avoids CPU copies before TensorRT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2306", "title": "Hailo-8 Multi-Tenant SLA", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a model multiplexing architecture that guarantees a strict 30 FPS SLA for the customer counting model while utilizing idle cycles for the batch inventory model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2308", "title": "Evaluate effective uptime comparing different edge checkpointing intervals", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether a 1-minute checkpoint interval provides a higher effective application throughput compared to a 5-minute interval?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2309", "title": "Apply double-buffering to calculate total image processing latency", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": 
"L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply a double-buffering strategy to calculate the total time required to process a batch of 4 frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2310", "title": "Assess queuing stability and utilization for Hailo-8 defect inspection", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Assess the stability of this queuing system and calculate the expected utilization factor of the accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2311", "title": "Design spatial tiling to fit BEV features in SRAM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a feature map tiling strategy to execute the detection head without spilling intermediate activations to the slow LPDDR5 memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2312", "title": "Develop three-year TCO energy model for delivery robot edge compute", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Develop a 3-year TCO model comparing these two options based purely on continuous energy usage at a cost of $0.20 per kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2313", "title": "Design micro-batching strategy to minimize edge pipeline parallelism bubbles", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a micro-batching strategy to minimize pipeline bubbles, and calculate the transfer time for the first micro-batch if the activation is split into 5 chunks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2314", "title": "Specify memory mapping technique to eliminate offline voice model latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify a memory management technique to eliminate this 500ms model loading latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2315", "title": "Calculate peak power difference between data center and edge accelerators", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the difference in peak power consumption between the two 
hardware deployment choices (400W vs 2x 60W)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2316", "title": "Evaluate hardware encoding to resolve disk bandwidth bottlenecks on edge", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether a hardware H.265 compression step before the disk write will resolve the bottleneck, assuming 100:1 compression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2318", "title": "Analyze INT4 memory savings enabling larger batch sizes on edge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does quantizing only the model weights to INT4 resolve the out-of-memory error for a batch size of 16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2319", "title": "Specify asynchronous network calls to prevent edge inference pipeline blocking", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Specify how asynchronous network calls improve the throughput of the local inference loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2327", "title": "Power vs Energy in Edge Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why stepping the Orin down from 60W to 15W mode might not decrease the total battery energy consumed per inference?", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 0}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2329", "title": "Theoretical NPU Execution Time", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical minimum time required for a single inference assuming 100% compute utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2330", "title": "PCIe Bottleneck on Edge Accelerator", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why computation-communication overlap fails to keep the NPU fully utilized in this setup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2331", "title": "PTQ Accuracy Degradation", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the mathematical cause of this accuracy collapse and propose a method to diagnose it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2335", "title": "Edge Factory Queue Stability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the NPU utilization, and does the inference queue remain stable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2337", "title": "Edge Orin Memory Bandwidth Limit", "topic": "kv-cache-management", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the peak memory bandwidth required to load the KV-cache for a single decoding step at 20 tokens/sec?", "visual": {"kind": "svg", "path": "edge-2337.svg", "alt": "Horizontal bar showing 60 GB/s utilization against a 204.8 GB/s limit", "caption": "Memory Bandwidth Utilization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2340", "title": "SRAM Tiering for Generation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory tiering strategy that maximizes the utilization of the SRAM during the generation phase?", "visual": {"kind": "svg", "path": "edge-2340.svg", "alt": "Bar chart showing huge 2TB/s SRAM bandwidth vs small 100GB/s DRAM bandwidth", "caption": "Bandwidth Hierarchy Capabilities"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2342", "title": "Asymmetric Cross-Rack AllReduce on Orin Nodes", "topic": "collective-communication", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the realistic Ring AllReduce time given the asymmetric tree topology, and identify whether a 2-D hierarchical AllReduce would beat a flat 4-node ring across the contended cross-rack link?", "visual": {"kind": "svg", "path": "edge-2342.svg", "alt": "Four Jetson Orin nodes arranged in a logical ring topology.", "caption": "Ring AllReduce logical topology across four Orin nodes."}, "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 2}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2343", "title": "Dual Hailo Pipeline FPS", "topic": "pipeline-parallelism", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "implement", 
"bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate end-to-end latency and peak throughput accounting for PCIe transfer and DMA setup overhead?", "visual": {"kind": "svg", "path": "edge-2343.svg", "alt": "Gantt chart showing frames overlapping across Stage 1 and Stage 2.", "caption": "Pipeline parallelism execution schedule."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2345", "title": "Camera PCIe Fanout Topology", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the total stream bandwidth and what topology maps cameras to accelerators to prevent bus contention?", "visual": {"kind": "svg", "path": "edge-2345.svg", "alt": "Hierarchical diagram routing 8 cameras into a central PCIe switch, dividing to 4 Hailo-8 chips.", "caption": "PCIe fanout topology for high-bandwidth camera streams."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2346", "title": "Orin Max Batch KV Limits", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the maximum batch size if 24GB of RAM is strictly reserved for the KV cache?", "visual": {"kind": "svg", "path": "edge-2346.svg", "alt": "Bar chart comparing 24GB KV cache capacity to individual 2.14GB request footprints.", "caption": "Memory footprint scaling for KV cache batch size."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2349", "title": "IP Camera Gigabit Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the risk of packet drop on the 1 Gigabit Ethernet port and propose a network-level fix?", "visual": {"kind": "svg", "path": "edge-2349.svg", "alt": "8 cameras funneling into a Gigabit switch pointing to an Orin node, with a bottleneck indicator.", "caption": "Gigabit Ethernet bottleneck for uncompressed video streams."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2350", "title": "Drone Navigation RTO", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the Recovery Time Objective (RTO) constraint if the system must resume control within 2 seconds of a transient glitch?", "visual": {"kind": "svg", "path": "edge-2350.svg", "alt": "Timeline showing a power glitch followed by a 2-second 
recovery window before flight resumes.", "caption": "Drone recovery time objective timeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2351", "title": "PagedAttention Fragmentation", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the reduction in internal memory fragmentation if the KV page size is reduced from 256 tokens to 16 tokens?", "visual": {"kind": "svg", "path": "edge-2351.svg", "alt": "Bar chart contrasting the maximum wasted tokens between 256-token pages and 16-token pages.", "caption": "Maximum internal fragmentation waste per sequence."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2352", "title": "Orin Memory Bandwidth Wall", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the memory bandwidth requirement for this generation speed and determine if the Orin can support it?", "visual": {"kind": "svg", "path": "edge-2352.svg", "alt": "Bar chart showing demanded 280 GB/s bandwidth overshooting the 204.8 GB/s limit of the Orin.", "caption": "Bandwidth limit violation during fast token generation."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2354", "title": "Jetson AGX Orin Queueing Latency Spike", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why latency on the Orin diverges from textbook M/M/1 predictions as utilization approaches 100%?", "visual": {"kind": "svg", "path": "edge-2354.svg", "alt": "A curve showing latency staying low until utilization hits about 80%, after which it spikes upward.", "caption": "M/M/1 Queueing delay demonstrating the hockey stick effect."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2355", "title": "Orin Memory Hierarchy Bandwidth Comparison", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the theoretical maximum memory bandwidth of the AGX Orin's LPDDR5 and compare its order of magnitude to standard L1 cache bandwidth?", "visual": {"kind": "svg", "path": "edge-2355.svg", "alt": "Bar chart comparing L1 Cache, L2 Cache, and LPDDR5 bandwidths.", "caption": "AGX Orin typical memory tier bandwidth comparison."}, "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 0}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2357", "title": "PagedAttention Block Size Fragmentation", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "both", "question": "Starting from the fragmentation comparison shown, explain why the fragmentation argument inverts on this iGPU once you account for block-table entries, and recommend a block size?", "visual": {"kind": "svg", "path": "edge-2357.svg", "alt": "Bar chart showing high internal fragmentation for 256-token blocks compared to 16-token blocks.", "caption": "KV Cache Internal Fragmentation Ratio."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2358", "title": "Jetson AGX Orin Duty-Cycle Energy Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply duty-cycling formulas to calculate the total energy consumed in Joules over exactly 1 hour?", "visual": {"kind": "svg", "path": "edge-2358.svg", "alt": "A step wave oscillating between 15W for brief periods and 5W for long periods.", "caption": "Active vs Idle Power over Time."}, "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 1}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2359", "title": "AGX Orin LPDDR5 Effective Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why does practical bandwidth utilization collapse under random memory access compared to contiguous streaming?", "visual": {"kind": "svg", "path": "edge-2359.svg", "alt": "Bar chart comparing sequential 160 GB/s against random access 30 GB/s with a red dashed line at peak 204.8 GB/s.", "caption": "Effective Bandwidth vs Access Pattern."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2360", "title": "Orin NVMe Interconnect Standard", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the primary high-speed interconnect topology standard used by the Jetson AGX Orin to attach local NVMe storage?", "visual": {"kind": "svg", "path": "edge-2360.svg", "alt": "Block diagram showing the Orin SoC connected to NVMe via PCIe Gen4.", "caption": "PCIe Gen4 x4 NVMe Interconnect Topology."}, "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2361", "title": "Dual-Tier Edge Checkpoint Formulation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a dual-tier checkpointing mechanism that satisfies both local storage wear limits and narrow remote bandwidth constraints?", "visual": {"kind": "svg", "path": "edge-2361.svg", "alt": 
"Graph showing small frequent local checkpoints and one large rare cloud checkpoint.", "caption": "Dual-Tier Checkpointing Profile."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2362", "title": "KV Cache Pre-allocation Memory Waste", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the maximum batch size before OOM under static max-length allocation versus dynamic paged allocation?", "visual": {"kind": "svg", "path": "edge-2362.svg", "alt": "Bar chart comparing high memory allocation for Static Max against low usage for dynamic paging.", "caption": "KV Cache Allocation Efficiency: Static vs Paged."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2363", "title": "Orin YOLOv8 Queue Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Using M/D/1, what is mean frame response time, how does it compare with M/M/1, and what latency is saved?", "visual": {"kind": "svg", "path": "edge-2363.svg", "alt": "A hockey-stick curve showing latency skyrocketing as utilization approaches 1.0.", "caption": "M/M/1 Latency vs Utilization."}, "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 2}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2364", "title": "Hailo-8 Camera Fan-in Link", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Compute the aggregate ingress bandwidth for 4 and 6 cameras, identify which stage saturates first, and explain the order of binding constraints?", "visual": {"kind": "svg", "path": "edge-2364.svg", "alt": "Four cameras feeding into a single saturated network switch.", "caption": "4:1 Fan-in network topology causing congestion."}, "chain_ids": ["edge-chain-auto-secondary-017-56"], "chain_positions": {"edge-chain-auto-secondary-017-56": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-56": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2365", "title": "Orin Power Mode Threshold", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the energy consumed per inference in each mode, and which is more energy-efficient?", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 2}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2367", "title": "Hailo-8 Dual Model Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L1", "zone": "recall", 
"bloom_level": "remember", "status": "published", "phase": "both", "question": "Using the overlapping pipeline as a baseline, what are the actual latency and steady-state throughput of the single-fabric multi-context execution?", "visual": {"kind": "svg", "path": "edge-2367.svg", "alt": "Bubble Gantt chart showing stage 1 (10ms) overlapping with stage 2 (15ms).", "caption": "Pipelined model execution on Edge TPU."}, "chain_ids": ["edge-chain-auto-secondary-017-21"], "chain_positions": {"edge-chain-auto-secondary-017-21": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2368", "title": "Orin 4K Memory Bandwidth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the percentage of total memory bandwidth consumed by writing these frames against the LPDDR5 limit (204.8 GB/s)?", "visual": {"kind": "svg", "path": "edge-2368.svg", "alt": "Bar chart comparing massive 204.8 GB/s capacity against tiny 1.5 GB/s camera write.", "caption": "Camera write bandwidth vs total LPDDR5 capacity."}, "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 1}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2369", "title": "Hailo-8 SSD RTO Bottleneck", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the Recovery Time Objective (RTO) limit imposed by the SSD speed?", "visual": {"kind": "svg", "path": "edge-2369.svg", "alt": "Timeline showing 2 seconds of recovery time.", "caption": "RTO bound by SSD read speed."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2370", "title": "Edge Multi-Camera Sizing", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What TOPS and LPDDR5 bandwidth are required, is the system compute- or bandwidth-bound, and which Orin power mode suffices?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 4}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2371", "title": "M/M/1 Robotics Processing", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the average wait time in the queue, and determine if the total system latency meets a 100ms end-to-end deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2372", "title": "Hailo-8 Zero-Copy Gateway", "topic": "model-serving-infrastructure", "competency_area": 
"cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct an end-to-end multi-process serving architecture that maintains the 2.5W power envelope while moving data from the video decoder to the Hailo engine?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2373", "title": "Priority Queue Multi-Camera Scheduling", "topic": "queueing-theory", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can the scheduling model be optimized to ensure safety-critical cameras bypass FIFO without starving the other 8 cameras?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 4}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2374", "title": "Triton Dynamic Batching on Edge", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a multi-tenant serving architecture using NVIDIA Triton to allocate GPU memory effectively and implement dynamic batching?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 4}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2375", "title": "Hailo-8 Batching vs Latency", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "edge", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze whether buffering 10 frames into a single batch is more energy-efficient than processing each frame individually as it arrives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2376", "title": "DETR Attention Tiling Edge", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Analyze the LPDDR5 memory traffic for the self-attention maps and create a tiling strategy to prevent spilling out of the Orin's L2 cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2377", "title": "Edge Device Checkpointing Under Variable Power Modes", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the trade-offs of asynchronous versus synchronous local NVMe checkpointing for a 2GB active map state under 204.8 GB/s memory bandwidth constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2378", "title": "Asynchronous Double-Buffering for Edge Bounding Box Tracking", "topic": "communication-computation-overlap", 
"competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply a double-buffering scheme to overlap the host's bounding box decoding with the accelerator's INT8 convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2379", "title": "Mixed-Precision Quantization Aware Training for Edge GPUs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a custom quantization-aware training scheme that targets INT8 tensor cores while selectively keeping sensitive depthwise layers in FP16?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2380", "title": "Zero-Copy DMA Pipeline for Accelerated Edge Video Streams", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a hardware-accelerated pipeline specification that offloads decoding to a VPU and uses zero-copy memory buffers?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 4}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2381", "title": "Dual-Bank Atomic Checkpointing on Edge Storage Media", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a dual-bank checkpointing mechanism where the Orin writes to an A/B partition on the eMMC over the alternative of simply journaling gradients?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2382", "title": "Overlapping Autoregressive Generation with Wi-Fi Transmission", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify how to overlap the autoregressive INT8 token generation step with the network transmission of previous tokens to minimize inter-token latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2383", "title": "Energy Costs of Accelerator Initialization Versus Inference", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the total energy cost per positive detection if the accelerator requires 500ms to boot and load weights over PCIe before it can perform a 20ms inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2384", "title": "Hardware Video Decoder Offloading in Edge ML 
Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Summarize the role of the NVDEC hardware decoder in the data ingestion pipeline before the tensor cores execute the model?", "chain_ids": ["edge-chain-auto-023-10"], "chain_positions": {"edge-chain-auto-023-10": 0}, "chain_tiers": {"edge-chain-auto-023-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2385", "title": "Asynchronous Gradient Clipping for Unstable Edge Connections", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the use of asynchronous gradient clipping and local momentum buffering to maintain training stability when a node reconnects after missing 3 synchronization rounds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2386", "title": "Deterministic M/D/1 Queuing Delay for Edge Camera Streams", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the system as an M/D/1 queue to find the average wait time in the buffer before processing begins?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2389", "title": "Edge Autoregressive Bandwidth", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how to estimate the minimum memory bandwidth required to generate 15 tokens per second for a single user?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2391", "title": "Jetson Multi-Model Triton Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Design a serving architecture to maximize GPU utilization and handle varying frame rates among the 5 concurrent models?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 2}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2392", "title": "Hailo-8 PIR Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy efficiency of using a small PIR motion sensor to trigger the Hailo-8 versus keeping the Hailo-8 continuously active at 1 FPS?", "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 4}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2394", "title": "Multi-Camera Batch Serving", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the serving technique that combines independent frames into a single batch, and quantify the latency-vs-throughput trade-off to meet the 33ms SLA?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 0}, "chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2396", "title": "Jetson Orin VIO Power Bottleneck Diagnosis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a diagnostic framework to pinpoint the compute bottlenecks and propose an architectural modification to keep the power strictly under 30W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2398", "title": "INT8 Requantization Fallback", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What quantization-related alignment issue causes a residual connection addition to fail or perform poorly on an integer-only accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2399", "title": "Orin Zero-Copy Cropping", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you pass an 800x600 crop to the GPU without CPU memcpy when the source pitch differs from the tensor layout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2402", "title": "Orin Thermal Throttle Floor for SLA", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Determine whether the thermally-throttled 30W envelope meets the 30 FPS SLA, and recommend whether thermal mitigation or model trimming is the cheaper fix?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 0}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2404", "title": "Gateway Batch Pipeline Utilization", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum effective throughput and utilization of the Orin pipeline to determine if it can sustain the gateway arrival rate?", "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2406", "title": "PCIe vs Compute Bind-Flip Under Thermal Throttle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the steady-state FPS budget by computing the PCIe-bound and compute-bound FPS in both cold and warm regimes, and identify the binding constraint in each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2408", "title": "Drone Edge Compute Energy Penalty", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total extra energy in Joules consumed by executing the mapping operation compared to just remaining in the 15W idle state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2409", "title": "Industrial Camera LPDDR5 DMA Tax", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Does camera DMA, model weight reads, or activation traffic starve the others under aggregate LPDDR5 utilization?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 5}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2410", "title": "Evaluating Flash Storage Endurance for Edge Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the feasibility of this checkpointing strategy regarding storage lifespan and compute interruption, given the hardware specs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2411", "title": "Calculating Communication Hiding Feasibility on PCIe Edge", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the minimum network transfer time per feature map and determine if the communication can be perfectly hidden behind the computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2413", "title": "Specifying Hardware Accelerated CV Pipelines on Edge", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you specify a hardware-accelerated pipeline path to eliminate the CPU bottleneck for this workload?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 2}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2414", "title": "Designing Supercapacitor Emergency State Flushes for Edge", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an emergency checkpointing mechanism utilizing the supercapacitor, calculating if it holds enough energy to flush 50 MB of state to an NVMe drive writing at 500 MB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2415", "title": "Evaluating Serialized Asynchronous CUDA Copies on an Edge SoC", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why the memory copies might still serialize with computation despite using asynchronous CUDA APIs?", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 1}, "chain_tiers": {"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2416", "title": "Coral TPU Batching Effect on Queue Stability", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether batch-of-1, batch-of-2, or batch-of-4 best satisfies the 60 fps arrival rate at the 80 ms p95 SLA using M/D/1 queueing theory?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 4}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2417", "title": "Calculating Maximum Framerate from Accelerator TOPS Capacity", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How do you calculate the theoretical maximum framerate the Hailo-8 can achieve, and what is the relationship between TOPS and GOPS?", "chain_ids": ["edge-chain-auto-026-13"], "chain_positions": {"edge-chain-auto-026-13": 0}, "chain_tiers": {"edge-chain-auto-026-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2419", "title": "Analyzing Inter-Process Communication Overheads in Video Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does the Python multiprocessing queue impact memory bandwidth and latency for this 4K video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2420", "title": "Transient Queue Buildup from Stochastic Arrivals", "topic": "queueing-theory", "competency_area": "latency", 
"track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the system require a queue buffer even though the maximum processing rate (40 FPS) safely exceeds the average arrival rate (30 FPS)?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 1}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2422", "title": "Flash Wearout vs. Recovery Time in Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe how the 5-minute checkpoint interval balances physical hardware constraints against the system's Recovery Time Objective (RTO)?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2424", "title": "TensorRT PTQ Calibration on Orin with 200 Scenes", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which TensorRT INT8 calibrator best fits the 200-scene constraint, and what concrete TensorRT mitigations are available before resorting to QAT?", "chain_ids": ["edge-chain-auto-019-09"], "chain_positions": {"edge-chain-auto-019-09": 2}, "chain_tiers": {"edge-chain-auto-019-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2425", "title": "Heuristic Priority Queuing for Edge Vision", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you apply a dynamic queueing mechanism to minimize overall dropped frames of high-value events without upgrading hardware?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 1}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2426", "title": "Zero-Copy Memory on Unified Edge Devices", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is a 'zero-copy' memory buffer, and why does it drastically reduce latency on unified memory architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2427", "title": "Dynamic Batching Inefficiencies in TensorRT", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why dynamically sending a batch size of 3 might result in sub-optimal latency compared to an engine compiled specifically for batch 3?", "chain_ids": ["edge-chain-auto-022-11"], "chain_positions": {"edge-chain-auto-022-11": 1}, 
"chain_tiers": {"edge-chain-auto-022-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2428", "title": "Symmetric INT8 Quantization Formula", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the standard mathematical formula used to map an FP32 value to a symmetric INT8 representation?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 0}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2429", "title": "Hailo-8 Mixed Precision INT8 Clipping Diagnosis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a mixed-precision diagnostic strategy to restore mAP without falling below the 20 TOPS throughput requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2430", "title": "D/G/1 Jetson Buffer Limits", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the system's queue dynamics to determine if an infinite buffer is necessary or if a finite drop-policy is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2431", "title": "Jetson Zero-Copy Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why standard CPU memory buffers are highly inefficient on this platform and name a better primitive?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 3}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2432", "title": "Supercapacitor Graceful Shutdown", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "During the 5-second shutdown window, which subsystems must checkpoint and in what canonical order?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2433", "title": "TensorRT INT8 Speedup Bound", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical maximum speedup of a purely compute-bound matrix multiplication moving from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2434", "title": "Drop-Oldest Queue Policy", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why implementing a LIFO or drop-oldest queueing policy improves real-time security alerts?", "chain_ids": ["edge-chain-auto-018-07"], "chain_positions": {"edge-chain-auto-018-07": 0}, "chain_tiers": {"edge-chain-auto-018-07": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2436", "title": "PCIe Double-Buffering for Edge Accelerators", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the transfer latency, and can a 2-frame double buffer fully overlap communication with computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2441", "title": "Diagnosing Attention Collapse in INT8 ViTs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L6+", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the specific mathematical operation in the transformer architecture that fails under uniform INT8 and propose an architectural precision override?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2442", "title": "Evaluate Accelerator Mismatch for Autoregressive LLMs", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an end-to-end evaluation of this deployment architecture and identify the fundamental hardware mismatch preventing real-time text generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2444", "title": "Orin SSD Wear versus Recovery Time", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between checkpointing every 1 minute versus every 10 minutes regarding SSD wear-out time and worst-case recovery compute?", "chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 3}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2445", "title": "Orin DMA and Inference Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an overlapping execution schedule using CUDA streams and DMA engines to maximize total system frames-per-second?", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 2}, "chain_tiers": 
{"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2447", "title": "Hailo-8 Inference Latency Estimate", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Estimate the single-frame inference latency in milliseconds assuming the compiler achieves a hardware utilization efficiency of precisely 25%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2448", "title": "Orin Parameter Bandwidth", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the minimum memory bytes read from LPDDR5 strictly to load the weights for one full forward pass if stored in FP16 versus INT8?", "chain_ids": ["edge-chain-auto-019-11"], "chain_positions": {"edge-chain-auto-019-11": 0}, "chain_tiers": {"edge-chain-auto-019-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2449", "title": "Camera Batching Latency Evaluation", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether Batch=1 or Batch=4 minimizes the end-to-end tail latency for the worst-case camera frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2457", "title": "Energy Calculation for Drone Collision Avoidance on Hailo-8", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total energy consumed by the Hailo-8 accelerator over 1 minute of active flight?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2459", "title": "Latency Reduction via GPU Unified Memory Data Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how replacing the CPU alignment step with a custom CUDA kernel impacts the overall pipeline latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2460", "title": "Memory Bandwidth Limits on Orin Vision", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Evaluate why the Orin might still fail to hit 30 FPS despite having 5.5x the required peak compute TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2462", "title": "Layer Fusion for Hailo-8 SRAM 
Savings", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply layer fusion to reduce the peak memory footprint of the activation maps during inference?", "chain_ids": ["edge-chain-auto-024-02"], "chain_positions": {"edge-chain-auto-024-02": 1}, "chain_tiers": {"edge-chain-auto-024-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2463", "title": "Symmetric Zero-Point Quantization on Edge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What zero-point value does symmetric INT8 quantization use, and what hardware overhead does it avoid on edge NPUs?", "chain_ids": ["edge-chain-auto-019-10"], "chain_positions": {"edge-chain-auto-019-10": 2}, "chain_tiers": {"edge-chain-auto-019-10": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2466", "title": "Cold Boot Power Penalties on Edge Devices", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the power traces showing a 2-second high-power phase before every inference to diagnose the flaw in the duty-cycling logic?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2467", "title": "Dynamic Range Quantization for Memory Bandwidth", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "edge", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply dynamic range quantization to the model and explain which operations still run in FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2469", "title": "Constructing Edge Duty-Cycling Power Budgets", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Construct a duty-cycling strategy that meets a daily budget of 75Wh while processing each trigger with a 5-second inference window?", "chain_ids": ["edge-chain-auto-secondary-014-10"], "chain_positions": {"edge-chain-auto-secondary-014-10": 3}, "chain_tiers": {"edge-chain-auto-secondary-014-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2472", "title": "Applying Zero-Copy Unified Memory Architectures", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why is Zero-Copy advantageous for feeding camera input directly to the GPU in a Unified Memory Architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"edge-2475", "title": "Analyzing Memory Bus Contention Between Subsystems", "topic": "communication-computation-overlap", "competency_area": "latency", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the impact of a unified memory bottleneck if the DMA controller and the GPU compete for LPDDR5 simultaneously?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2476", "title": "Compute Allocation Plans for Heterogeneous Models", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a compute resource allocation plan and determine the system's expected utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2478", "title": "Drone Pre-Processing Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the data pipeline bottlenecks between memory, CPU, and GPU to determine why the accelerator is starved?", "chain_ids": ["edge-chain-auto-023-09"], "chain_positions": {"edge-chain-auto-023-09": 4}, "chain_tiers": {"edge-chain-auto-023-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2479", "title": "Dynamic Model Multiplexing", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a dynamic model multiplexing infrastructure that ensures zero downtime during product line switchovers while maintaining the 2.5W power envelope?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2480", "title": "Wildlife Camera Energy Waste", "topic": "duty-cycling", "competency_area": "power", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the total energy consumption per hour and determine the percentage of energy wasted during the idle periods?", "chain_ids": ["edge-chain-auto-secondary-014-09"], "chain_positions": {"edge-chain-auto-secondary-014-09": 3}, "chain_tiers": {"edge-chain-auto-secondary-014-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2481", "title": "Realistic Orin Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "edge", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply hardware efficiency factors to estimate the realistic maximum frame rate the device can achieve, assuming a 30% hardware utilization rate?", "chain_ids": ["edge-chain-auto-026-14"], "chain_positions": {"edge-chain-auto-026-14": 2}, "chain_tiers": {"edge-chain-auto-026-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2482", "title": "Edge Server Queue Depth", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the utilization of the accelerator and specify the required queue length to hold the average number of queued requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2483", "title": "Vision Transformer Memory Spillage", "topic": "memory-hierarchy-design", "competency_area": "architecture", "track": "edge", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why memory bandwidth bottlenecks a 50-GFLOP Vision Transformer despite the 204.8 GB/s limit, and propose a memory-fusion strategy?", "chain_ids": ["edge-chain-auto-024-01"], "chain_positions": {"edge-chain-auto-024-01": 4}, "chain_tiers": {"edge-chain-auto-024-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2484", "title": "Edge ID Persistence", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a strategy to maintain continuous tracking IDs across reboots without writing high-frequency video data to the SD card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2485", "title": "CUDA Stream Scheduling", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "edge", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify the scheduling timeline using CUDA streams to achieve maximum throughput, and calculate the time to process 10 batches?", "chain_ids": ["edge-chain-auto-secondary-016-04"], "chain_positions": {"edge-chain-auto-secondary-016-04": 0}, "chain_tiers": {"edge-chain-auto-secondary-016-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2487", "title": "Hailo-8 M/D/1 Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "edge", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Compute the expected queue length assuming an M/D/1 queueing model for the inference requests?", "chain_ids": ["edge-chain-auto-018-06"], "chain_positions": {"edge-chain-auto-018-06": 0}, "chain_tiers": {"edge-chain-auto-018-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2489", "title": "Edge Drone WAL Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "edge", "level": "L6+", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How effective is an A/B ping-pong checkpoint strategy combined with write-ahead logging (WAL) for preventing state corruption?", 
"chain_ids": ["edge-chain-auto-024-04"], "chain_positions": {"edge-chain-auto-024-04": 4}, "chain_tiers": {"edge-chain-auto-024-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2493", "title": "PCIe Video Streaming Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "edge", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze whether the PCIe bidirectional bandwidth becomes a bottleneck for streaming these uncompressed frames to the A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2504", "title": "10GbE Pipeline Microbatch Sizing", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the minimal microbatch size m needed to keep the pipeline bubble overhead strictly below 15%?", "chain_ids": ["edge-chain-auto-secondary-017-22"], "chain_positions": {"edge-chain-auto-secondary-017-22": 0}, "chain_tiers": {"edge-chain-auto-secondary-017-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2506", "title": "PCIe Arbitration Jitter Buffer", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the minimum asynchronous queue depth required between stages to mathematically hide this jitter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2508", "title": "BLE Connection Sync Buffering", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "edge", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a pipelined microbatching strategy to handle the BLE latency without stalling the 10ms compute stages?", "chain_ids": ["edge-chain-auto-secondary-017-22"], "chain_positions": {"edge-chain-auto-secondary-017-22": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2509", "title": "Wi-Fi 6 UAV AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the minimum transmission time required for the AllReduce, given the half-duplex shared medium constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2510", "title": "ESP-NOW Tree AllReduce", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the synchronization latency, assuming sequential hops due to the shared 
wireless medium?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2511", "title": "1GbE Orin AllGather Sync", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total latency of the Ring AllGather operation across the four nodes?", "chain_ids": ["edge-chain-auto-secondary-009-19"], "chain_positions": {"edge-chain-auto-secondary-009-19": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2513", "title": "CAN Bus Protocol Framing Penalty", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective data transfer time for 3 nodes sequentially sending to 1 central parameter server node?", "chain_ids": ["edge-chain-auto-secondary-009-20"], "chain_positions": {"edge-chain-auto-secondary-009-20": 1}, "chain_tiers": {"edge-chain-auto-secondary-009-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2514", "title": "Hierarchical Robot Wi-Fi 6 Sync", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the total execution time for a Local Reduce -> Global AllReduce -> Local Broadcast topology?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2515", "title": "Low-Rate Satellite Mesh Gossip Protocol", "topic": "collective-communication", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a collective topology that minimizes synchronous barrier stalls under this high-latency, lossy environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2518", "title": "NPU Preemption Context Switch", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What latency bubble comes from writing out and reading in a 20 MB NPU context at 50 GB/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2519", "title": "Unified Memory KV Read Bandwidth", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the memory bandwidth utilization consumed purely by the KV cache during generation at 20 tokens per second?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2520", "title": "Prefix Caching TTFT Bubble", "topic": 
"kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the pipeline bubble (TTFT delay) created by synchronizing the new tokens with the cached state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2522", "title": "UFS 4.0 Storage KV Offload", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the pipeline stall time introduced by retrieving this offloaded context block?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2523", "title": "Multimodal KV Ring Buffer", "topic": "kv-cache-management", "competency_area": "architecture", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you create a KV cache memory strategy that prevents Unified Memory OOM without causing CPU garbage collection stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2525", "title": "PCIe Switch Oversubscription", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the bandwidth bottleneck and synchronization delay of this concurrent DMA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2526", "title": "1GbE Star Topology Collision Domain", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total transfer time, accounting for the collision domain bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2527", "title": "RP2040 Dual-Core SRAM Contention", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the synchronization overhead per 1000 audio samples if accesses align perfectly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2528", "title": "CSI-2 Direct vs PCIe Switched Latency", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the added synchronization overhead of the store-and-forward FPGA topology per frame?", "chain_ids": ["edge-chain-auto-secondary-017-04"], "chain_positions": {"edge-chain-auto-secondary-017-04": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "edge-2529", "title": "Jetson Cluster Ring vs Star Broadcast", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization latency penalty of using a Ring instead of a Star topology for a one-to-all broadcast?", "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 1}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2531", "title": "PCIe Switch AllGather Schedule for Jetson Orin", "topic": "interconnect-topology", "competency_area": "networking", "track": "edge", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an interconnect topology and collective routing scheme to minimize tail latency, accounting for the PCIe switch overheads and Orin's LPDDR5 bandwidth limits?", "chain_ids": ["edge-chain-auto-secondary-017-05"], "chain_positions": {"edge-chain-auto-secondary-017-05": 2}, "chain_tiers": {"edge-chain-auto-secondary-017-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2533", "title": "Demand Paging for Edge Model Deployment", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "edge", "level": "L2", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does mmap with unified memory solve this, and what are the latency implications?", "chain_ids": ["edge-chain-auto-secondary-008-25"], "chain_positions": {"edge-chain-auto-secondary-008-25": 1}, "chain_tiers": {"edge-chain-auto-secondary-008-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2534", "title": "KV-Cache-Aware Load Balancing at the Edge", "topic": "load-balancing", "competency_area": "deployment", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are multi-turn conversations slower with round-robin routing across 4 edge nodes, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2536", "title": "Diagnosing Zero Latency Gains from Unstructured Pruning on Coral TPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the 75% unstructured sparsity fail to yield any latency improvements on the Coral Edge TPU, and what architectural characteristic of the accelerator dictates this outcome?", "chain_ids": ["edge-chain-auto-001-01"], "chain_positions": {"edge-chain-auto-001-01": 2}, "chain_tiers": {"edge-chain-auto-001-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "edge-2540", "title": "The Attention Bandwidth Bottleneck", 
"topic": "real-time-deadlines", "competency_area": "architecture", "track": "edge", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the SoC's compute drastically underutilized during the MHA layers, causing a deadline miss despite the low total FLOP count?", "chain_ids": ["edge-chain-auto-001-06"], "chain_positions": {"edge-chain-auto-001-06": 3}, "chain_tiers": {"edge-chain-auto-001-06": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0000", "title": "The HBM vs L1 Latency Gap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Roughly how much slower is accessing HBM3 memory compared to an L1 register read?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0001", "title": "The Energy Tax of Data Movement", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which operation consumes more energy: performing an FP16 multiply-add or reading the operands from DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0002", "title": "The FP16 Model Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How much VRAM does it occupy just to load the weights in FP16 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0003", "title": "The Ridge Point Logic", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the ridge point of this accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0004", "title": "The FP16 vs INT8 Precision Choice", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do we move to 8-bit integers for deployment?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 0}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0005", "title": "The FLOPS vs Time Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "If your GPU has a peak performance of 100 TFLOPS, what is the theoretical minimum time to finish this operation?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0006", "title": "The Battery Drain Math", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How many hours of continuous inference could you theoretically run on this 15 Watt-hour battery?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0007", "title": "The Embedding OOM Screen", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Before you even look at the code, what basic math did you fail to do?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["You forgot to set PyTorch's `max_split_size_mb` configuration.", "The Adam optimizer's momentum states consume 3x the memory of the weights.", "100M embeddings at FP32 (128-dim) requires 51.2GB, which physically exceeds the 16GB VRAM.", "The PCIe Gen3 bus is too slow to transfer the embeddings in time."], "correct_index": 2}}, {"id": "global-0009", "title": "The PCIe Bandwidth Screen", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the theoretical peak bandwidth of a PCIe Gen4 x16 slot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.5 TB/s", "~32 GB/s", "~10 Gbps", "~400 Gbps"], "correct_index": 1}}, {"id": "global-0010", "title": "The PyTorch DataLoader Deadlock", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why will setting num_workers=0 make the Senior Engineers instantly reject your code for a production environment?", "chain_ids": ["global-chain-auto-secondary-017-23"], "chain_positions": {"global-chain-auto-secondary-017-23": 0}, "chain_tiers": {"global-chain-auto-secondary-017-23": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It forces PyTorch to use FP64 instead of FP32.", "It causes PyTorch to spawn too many zombie processes.", "It forces synchronous data loading on the main thread, starving the GPU.", "It disables the L1 cache on the CPU."], "correct_index": 2}}, {"id": "global-0011", "title": "The Cost of Data Movement", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which operation consumes significantly more energy on a modern accelerator: performing the FP16 multiply-add operation or reading the operands from main memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FP16 multiply-add consumes about 10x more energy.", "They consume roughly the same amount of energy.", "Reading from main memory consumes ~100x 
to 1000x more energy.", "Compute consumes more energy only if batch size is exactly 1."], "correct_index": 2}}, {"id": "global-0013", "title": "The Parameter Memory Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If the weights are stored in FP16, what is the absolute minimum GPU memory required just to hold the weights?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 0}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.5 GB", "~7 GB", "~14 GB", "~28 GB"], "correct_index": 2}}, {"id": "global-0014", "title": "The KV-Cache Bottleneck", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What memory structure is the primary cause of this slowdown?", "chain_ids": ["global-chain-auto-secondary-011-33"], "chain_positions": {"global-chain-auto-secondary-011-33": 0}, "chain_tiers": {"global-chain-auto-secondary-011-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The L1 Instruction Cache", "The Parameter Server", "The Gradient Checkpoint buffer", "The Key-Value (KV) Cache"], "correct_index": 3}}, {"id": "global-0015", "title": "Quantization Basics", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Why is INT8 quantization so popular for deploying models on edge devices like mobile phones?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 0}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It reduces memory bandwidth pressure by 4x and allows the use of highly energy-efficient integer ALUs.", "It automatically sparsifies the network, dropping 75% of the parameters to save memory.", "It increases the mathematical precision of the final output layer by removing floating-point noise.", "It maps 32-bit floats to 8-bit floats, keeping the same numerical distribution but running faster."], "correct_index": 0}}, {"id": "global-0016", "title": "The Purpose of the Roofline Model", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What does this tell you about your system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is CPU-Bound due to slow instruction dispatch.", "The workload is Memory-Bound (Bandwidth constrained).", "The workload is Compute-Bound (ALU constrained).", "The hardware is experiencing thermal throttling on the memory bus."], "correct_index": 1}}, {"id": "global-0017", "title": "Network 
Topologies", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do we use network topologies like Fat-Tree (Clos) instead of a simple traditional Star or Ring network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is the only topology supported by PCIe Gen5.", "It provides high, non-blocking bisection bandwidth across the entire cluster.", "It allows GPUs to share a single unified L2 cache.", "It eliminates the need for network switches entirely."], "correct_index": 1}}, {"id": "global-0018", "title": "Data Parallelism vs Model Parallelism", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Which distributed strategy should you use to efficiently scale training across 8 GPUs?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 0}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Tensor Parallelism: Shards the 14GB model into 1.75GB pieces, wasting bandwidth on communication.", "Pipeline Parallelism: Divides the model into 8 stages of 1.75GB each, introducing severe pipeline bubbles.", "Data Parallelism: Replicates the 14GB model 8 times (112GB total), achieving near-linear 8x speedup.", "Expert Parallelism: Routes tokens to 8 different 14GB experts, eliminating communication overhead."], "correct_index": 2}}, {"id": "global-0019", "title": "SRAM vs DRAM Characteristics", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why is SRAM typically used for on-chip buffers instead of DRAM in deep learning accelerators, despite its lower density?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["DRAM has higher latency but significantly higher bandwidth per pin than SRAM.", "SRAM is denser and allows for larger on-chip memory capacity compared to DRAM.", "SRAM provides lower latency and higher bandwidth without the need for periodic refresh cycles, unlike DRAM.", "DRAM is volatile while SRAM is non-volatile, making SRAM better for persistent weights."], "correct_index": 2}}, {"id": "global-0020", "title": "Arithmetic Intensity & Roofline Model", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "In the context of the Roofline Model, what does it mean if a specific layer in a neural network has very low arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The operation is compute-bound because 312 TFLOPS / 4 FLOPs = 78 trillion elements per second.", "The operation performs very few computations per byte of memory accessed, meaning it is strictly memory 
bandwidth bound.", "The operation achieves 312 TFLOPS because 0.5 FLOPs/Byte is less than the 156 FLOPs/Byte ridge point.", "The layer is bounded by the 40MB L2 cache since 8 bytes * 4 = 32 bytes exceeds cache capacity."], "correct_index": 1}}, {"id": "global-0021", "title": "High Bandwidth Memory (HBM) Architecture", "topic": "extreme-quantization", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does the bandwidth of one HBM3 stack compare to one GDDR6 chip, and why?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local TinyML detection with event-based transmission (0.14 Wh/day).", "Continuous cloud streaming detection (4.80 Wh/day).", "Local TinyML detection without transmission (4.80 Wh/day).", "Event-based transmission bypassing MCU (0.12 Wh/day)."], "correct_index": 0}}, {"id": "global-0022", "title": "Cache Thrashing & Matrix Operations", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What architectural issue likely causes severe slowdown and high L2 miss rates in a custom CUDA matmul kernel, and how would you resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 67MB matrices perfectly fit into the 40MB L2 cache, meaning the misses are a profiling artifact.", "Poor spatial locality is causing cache lines to be evicted before they can be fully utilized; breaking into 4KB tiles resolves this.", "The 4096-thread block size exceeds the L2 capacity of 32 threads, causing automatic bypass.", "The memory controller limits L2 bandwidth to 67MB/s, requiring execution throttling."], "correct_index": 1}}, {"id": "global-0023", "title": "Memory Hierarchy Latency Profiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the correct fastest-to-slowest order of the memory hierarchy when profiling data movement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["L1 Cache -> L2 Cache -> Main Memory (DRAM) -> Solid State Drive (NVMe)", "L1 Cache -> Main Memory (DRAM) -> L2 Cache -> Solid State Drive (NVMe)", "Main Memory (DRAM) -> L1 Cache -> L2 Cache -> Solid State Drive (NVMe)", "L2 Cache -> L1 Cache -> Solid State Drive (NVMe) -> Main Memory (DRAM)"], "correct_index": 0}}, {"id": "global-0025", "title": "Roofline Model Interpretation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Is this layer compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because 50 TFLOPS is reached.", "Memory-bound, because the layer's arithmetic intensity (50) is less than the hardware's ridge point (100 FLOPs/byte).", "Compute-bound, because 100 TFLOPS is the absolute limiting factor for this layer.", "Memory-bound, because the layer 
achieves 100 TFLOPS at 50 FLOPs/byte."], "correct_index": 1}}, {"id": "global-0026", "title": "Optimizing Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the most effective way to improve the arithmetic intensity of the bias-plus-ReLU element-wise sequence?", "chain_ids": ["global-chain-auto-secondary-006-30"], "chain_positions": {"global-chain-auto-secondary-006-30": 0}, "chain_tiers": {"global-chain-auto-secondary-006-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantize the model to INT8 to reduce the compute time of the element-wise operations.", "Perform kernel fusion to combine the operations into a single kernel, reducing reads and writes to global memory.", "Increase the batch size to artificially inflate the FLOP count without changing memory accesses.", "Move the element-wise operations to the CPU to free up GPU tensor core resources."], "correct_index": 1}}, {"id": "global-0027", "title": "Batch Size and Compute Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does increasing the batch size affect the arithmetic intensity of a linear layer?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 0}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It decreases arithmetic intensity because larger batches require proportionally more memory to store the activations.", "It has no effect on arithmetic intensity since the model weights remain the exact same size regardless of batch.", "It increases arithmetic intensity by reusing the loaded model weights across multiple inputs, amortizing memory access costs.", "It increases arithmetic intensity by physically reducing the total number of FLOPs required to process the data."], "correct_index": 2}}, {"id": "global-0028", "title": "LLM Generation Phase Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does auto-regressive LLM decoding typically suffer from exceptionally low arithmetic intensity?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 1}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A single token is generated per step, requiring the model to load all weights and the entire KV cache from memory just to perform a small matrix-vector multiplication.", "The self-attention mechanism requires complex non-linear operations (like Softmax) that natively have low arithmetic intensity.", "Token generation requires frequent inter-GPU communication, which bottlenecks the compute operations.", "The context window is too small during generation, preventing the GPU from utilizing its tensor cores effectively."], "correct_index": 0}}, {"id": "global-0030", 
"title": "Layer 7 vs Layer 4 Load Balancing", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which type of load balancer is required for URL-path-based HTTP routing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Layer 3 Load Balancer", "Layer 4 Load Balancer", "Layer 7 Load Balancer", "DNS Round Robin"], "correct_index": 2}}, {"id": "global-0032", "title": "At-Least-Once Delivery Semantics", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What critical property must your consumer application implement to avoid corrupting data if a message is processed twice?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Eventual Consistency", "Idempotency", "ACID Transactions", "Two-Phase Commit"], "correct_index": 1}}, {"id": "global-0034", "title": "Post-Training Quantization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the most likely cause of the severe accuracy drop after int8 post-training quantization on the 256KB-RAM microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 256KB RAM is completely exhausted because INT8 requires 4 bytes per parameter.", "There are extreme outliers in the model's weight or activation distributions, compressing normal values into just 3 distinct quantization levels.", "Int8 quantization mathematically guarantees a 50% accuracy drop on all CNNs without retraining.", "The scale factor of 0.003 is too small for the microcontroller's ALU to process."], "correct_index": 1}}, {"id": "global-0035", "title": "Memory Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Will the 150KB-parameter, 100KB-activation model fit on 512KB Flash and 128KB SRAM, and where should each reside?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, everything can be stored in SRAM since 150KB + 100KB = 250KB, and Flash can act as virtual memory.", "Yes, the model weights should be stored in Flash memory, and the intermediate tensors allocated in SRAM.", "No, because the total size (250KB) exceeds the available SRAM (128KB).", "No, because intermediate tensors must be stored in Flash to prevent data loss on power cycles."], "correct_index": 1}}, {"id": "global-0036", "title": "Structured vs. 
Unstructured Pruning", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why might 70% unstructured pruning fail to improve latency on a standard mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Unstructured pruning reduces latency by 70% because 70% of multiply-accumulate operations are skipped.", "NPUs require exactly 50% sparsity to trigger bypass logic for multiply-accumulate (MAC) operations.", "Standard NPUs lack specialized hardware to exploit irregular sparsity patterns, making unstructured sparse operations inefficient.", "Unstructured pruning increases latency by exactly 30% due to the O(N) indexing overhead of sparse tensors."], "correct_index": 2}}, {"id": "global-0037", "title": "Minimizing Radio Usage", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which architectural approach will typically result in the lowest overall power consumption?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local TinyML detection with event-based transmission (0.14 Wh/day).", "Continuous cloud streaming detection (4.80 Wh/day).", "Local TinyML detection without transmission (4.80 Wh/day).", "Event-based transmission bypassing MCU (0.12 Wh/day)."], "correct_index": 0}}, {"id": "global-0038", "title": "Tensor Arena Allocation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does TFLM require a pre-allocated Tensor Arena instead of dynamic allocation during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Dynamic allocation requires a constant connection to a cloud-based memory manager.", "The Tensor Arena allows the model to compress weights at runtime to save Flash memory.", "Pre-allocation avoids memory fragmentation and ensures deterministic memory usage in resource-constrained, bare-metal environments.", "`malloc()` is only supported on 64-bit architectures, while most edge devices are 32-bit."], "correct_index": 2}}, {"id": "global-0039", "title": "The Dynamic DAG Scheduling Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you reduce the 9-second overhead of sequential LLM planning steps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0040", "title": "The Compound System Observability Stack", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design observability to detect and diagnose both performance and quality degradation in this 7-stage AI pipeline?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0041", "title": "The Agentic Memory Architecture", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design storage tiering, retrieval, and context-window management for this 500K-token coding-agent memory?", "chain_ids": ["global-chain-auto-secondary-017-54"], "chain_positions": {"global-chain-auto-secondary-017-54": 1}, "chain_tiers": {"global-chain-auto-secondary-017-54": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0042", "title": "The Multi-Tenant Vector Isolation Problem", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Which RAG indexing architecture should you choose for 500 tenants, and how would you balance cost, latency, isolation, and operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0043", "title": "The Experiment Reproducibility Crisis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you systematically debug why identical-code retraining drops the model from 94% to 91% accuracy?", "chain_ids": ["global-chain-auto-secondary-017-23"], "chain_positions": {"global-chain-auto-secondary-017-23": 1}, "chain_tiers": {"global-chain-auto-secondary-017-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0044", "title": "The 10,000-Experiment Infrastructure Challenge", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design experiment tracking to scale from 10,000 to 100,000 experiments per month without training or query bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0045", "title": "The Full-Stack Constraint Propagation Workflow", "topic": "extreme-quantization", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the ML development workflow that propagates these constraints backward through the entire development lifecycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0046", "title": "The ML CI/CD Pipeline Design", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why is microservice-style CI/CD incomplete for ML, and what ML-native CI/CD pipeline would you design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "global-0047", "title": "The 3x Rule of Backpropagation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the backward pass of linear layers cost about 2x the forward pass, making training about 3x forward compute?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 0}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0048", "title": "Skip Connections as Gradient Highways", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "From a gradient flow perspective, why does y = f(x) + x enable 100+ layer networks when non-residual networks struggle beyond 20 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0049", "title": "The Critical Batch Size", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Under what condition does doubling batch size halve training time, and when does that linear speedup break down?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 0}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0050", "title": "The Optimizer Memory Tradeoff at Scale", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "When would you consider SGD with momentum for a 175B LLM, and when is Adam's memory overhead justified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0051", "title": "MLPerf Execution Scenarios", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does MLPerf Inference need four separate scenarios, and what system property does each stress?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0052", "title": "The Disaggregated Evaluation Pipeline", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many samples per subgroup are needed to detect a 5-point accuracy gap with 80% power, and how large should the eval set be?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0053", "title": "The DRO Training Cost Multiplier", "topic": 
"federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How much more expensive is fairness-aware training with Group DRO, and is there a cheaper alternative that achieves 80% of the benefit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0054", "title": "MoE Decoding Latency Spike at High Batch Size", "topic": "mixture-of-experts", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the root cause of the 8x7B top-2 MoE slowing from 15ms/token at batch 1 to 45ms/token at batch 128?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0055", "title": "Continuous Roofline Profiling for Multimodal", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you design this profiling system to decompose the performance limiters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0056", "title": "E2E Edge Benchmarking Under Thermal Limits", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you build an end-to-end benchmark and profiler to isolate why P99 latency hits 2.8s after 30s on the 5W edge SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0057", "title": "The CPU Overhead Anomaly", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 7B LLM latency spike with mixed sequence lengths under CUDA graphs, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0058", "title": "The Memory-Bound Custom MLP", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the bottleneck in the CustomGLU MLP with 96% HBM bandwidth utilization, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0059", "title": "The Phantom Memory Pool", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does CUDA graph capture OOM after eager warmup when the same model fits in eager mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0060", "title": 
"Tiered KV-Cache Page Size Tradeoffs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you choose and manage KV-cache block sizes to minimize HBM fragmentation while keeping PCIe Gen5 prefetches efficient?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 1}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0061", "title": "LLM Serving Arithmetic Intensity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you scale the 30B FP16 LLM on one 80GB A100 from 33 tokens/sec to 500 tokens/sec without hitting KV-cache OOMs?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 2}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0062", "title": "Mitigating KV Cache Fragmentation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you redesign the static KV cache so the 13B model on a 40GB A100 can serve far more than 8 concurrent requests?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 0}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0063", "title": "Optimizer State NVMe Offloading", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What causes the 60-second optimizer step for the 70B fine-tune, and how would you reduce it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0064", "title": "SRAM Layer Fusion for Edge CNNs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you eliminate the DRAM bottleneck between Conv 3x3, ReLU, and Conv 1x1 on the 4MB-SRAM edge accelerator?", "chain_ids": ["global-chain-auto-secondary-014-15"], "chain_positions": {"global-chain-auto-secondary-014-15": 0}, "chain_tiers": {"global-chain-auto-secondary-014-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0065", "title": "Diagnosing Flat Ring AllReduce Latency", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "both", "question": "How would you decompose the 106 ms synchronization time, and what is the true root cause?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0066", "title": "Root-Causing AllToAll Tail Latency", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 15-20 ms AllToAll tail latencies, and how would you mitigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0067", "title": "Petabyte-Scale Near-Duplicate Detection Tradeoffs", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the MinHash+LSH pipeline to deduplicate 10B documents without an O(N^2) shuffle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0068", "title": "Multi-PB Distributed Streaming Dataloader", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the streaming data loader to globally shuffle 20 PB from S3 at 819 GB/s without local NVMe?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0069", "title": "Optimizing Checkpoint Frequency via Young-Daly", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What checkpoint interval and architecture would maximize goodput with a 6-hour MTBF and 4-minute synchronous checkpoint?", "chain_ids": ["global-chain-auto-secondary-016-07"], "chain_positions": {"global-chain-auto-secondary-016-07": 1}, "chain_tiers": {"global-chain-auto-secondary-016-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0070", "title": "Asynchronous Multi-Tier Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you reduce the 2.4 TB checkpoint pause from 120 seconds to under 5 seconds while preserving node-failure tolerance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0071", "title": "Fast Recovery via In-Memory Redundancy", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design recovery to restart in under 1 minute after a single-node fault instead of reloading 1.5 TB from storage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0072", "title": "Mitigating 
Network Stragglers in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you detect and mitigate silent 400 Gbps-to-10 Gbps link degradation during a 10,000-GPU run?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0073", "title": "Architecting SDC Detection in Large Clusters", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect an SDC detection mechanism that prevents saving corrupted checkpoints without doubling compute costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0074", "title": "Diagnosing InfiniBand Adaptive Routing Loop", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is causing the 8 ms All-to-All tail latency with zero drops, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0075", "title": "3D Torus Deadlock and Credit Starvation", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused the 3D Torus network to freeze at 0 GB/s after a transient link flap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0076", "title": "RoCEv2 PFC Storm Propagation", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can one NIC that stops draining packets freeze the entire 1024-node RoCEv2 subnet within 2 ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0077", "title": "ECMP Hash Collisions with Elephant Flows", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are two 400G uplinks saturated while two are idle during the 64-GPU AllReduce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0078", "title": "Long-Haul InfiniBand Buffer Credit Exhaustion", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does RDMA over the 20 km 400 Gbps dark-fiber link cap at about 16 Gbps despite zero optical errors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "global-0079", "title": "GPUDirect RDMA NUMA Crossing Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is capping inter-node AllReduce at 120 GB/s on the 8-GPU, 4-NIC dual-socket node?", "chain_ids": ["global-chain-auto-secondary-016-14"], "chain_positions": {"global-chain-auto-secondary-016-14": 1}, "chain_tiers": {"global-chain-auto-secondary-016-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0080", "title": "800G PAM4 Signal Degradation from Dirty Fiber", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the likely cause of 800G OSFP links staying up but spiking to 250 ms latency and 10 Gbps throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0081", "title": "DCQCN Congestion Control Oscillation", "topic": "load-balancing", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 400G-to-5G sawtooth bandwidth oscillation under All-to-All load with zero drops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0082", "title": "Topology-Unaware Scheduling on Oversubscribed Fabric", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 16-node job get only 66 GB/s AllReduce while contiguous-rack jobs get 200 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0083", "title": "Dragonfly Topology Congestion without Valiant Routing", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does Group 1 to Group 5 traffic in the Dragonfly collapse to 40 Gbps per node despite healthy global links?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0084", "title": "In-Network Computing (SHARP) Resource Exhaustion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 21st concurrent job lose SHARP-like 5μs AllReduce latency while the first 20 remain fast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0085", "title": "PCIe ACS Blocking GPUDirect Peer-to-Peer DMA", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": 
"analyze", "status": "published", "phase": "both", "question": "Why are GPUDirect RDMA latency and bandwidth limited even when the GPU and NIC share a PCIe switch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0086", "title": "Multi-tenant LoRA Serving Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you design the inference architecture to minimize GPU cost while guaranteeing SLA across all tenants?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0087", "title": "Strict SLA Fraud Detection Ensembling", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "If traffic unexpectedly spikes to 80,000 QPS, how do you design the serving system to guarantee the 50ms SLA without dropping transactions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0088", "title": "Tradeoffs in High-Throughput Embedding Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you monitor 256-dimensional embedding drift at 100,000 QPS without adding 40 ms latency or OOMing 2 GB sidecars?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0089", "title": "Safe Deployment for Latency-Sensitive Generation Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should you continue the 10% canary or switch to shadow deployment for V2 under the 200 ms SLA, and why?", "chain_ids": ["global-chain-auto-secondary-011-33"], "chain_positions": {"global-chain-auto-secondary-011-33": 1}, "chain_tiers": {"global-chain-auto-secondary-011-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0090", "title": "Real-Time Global Toxicity Moderation Fairness Architecture", "topic": "responsible-ai", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you measure and mitigate dialect fairness issues at 100,000 QPS while keeping p99 latency under 50 ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0091", "title": "EU AI Act Compliance Pipeline Storage Design", "topic": "responsible-ai", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design the 10-year audit-log pipeline to satisfy EU AI Act evidence needs without violating GDPR minimization?", "chain_ids": ["global-chain-auto-secondary-017-66"], 
"chain_positions": {"global-chain-auto-secondary-017-66": 1}, "chain_tiers": {"global-chain-auto-secondary-017-66": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0092", "title": "Disaggregated Evaluation Cluster for Foundation Models", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you compute weekly fairness metrics over 10B images and 500 cohorts within the $25,000 evaluation budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0093", "title": "Federated Bias Mitigation in Mobile Health Diagnostics", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you preserve equal opportunity across 10 skin-tone groups without demographic data leaving devices or exceeding 200 TB/day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0094", "title": "Low-Latency Multi-Objective Recommendation Debiasing System", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you enforce creator exposure parity in the top-10 recommendations without breaking the 60 ms p99 SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0095", "title": "FP8 Gradient Casting Overflow Crisis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What caused this catastrophic failure?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0096", "title": "Cross-Lingual PTQ Activation Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did quantization break specific languages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0097", "title": "Asymmetric Quantization Throughput Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the smaller INT8 model slower than FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0098", "title": "The QAT Shadow Weight OOM", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing this 
massive memory inflation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0099", "title": "Depthwise Conv Per-Tensor Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do the Depthwise Convolution layers exhibit massive errors under per-tensor INT8 quantization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0100", "title": "Long-Context KV Cache RoPE Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What interaction destroyed the long-context retrieval?", "chain_ids": ["global-chain-auto-secondary-017-30"], "chain_positions": {"global-chain-auto-secondary-017-30": 0}, "chain_tiers": {"global-chain-auto-secondary-017-30": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0101", "title": "The Unfolded BatchNorm Latency Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is there zero latency improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0102", "title": "Serving 70B LLMs on Single Node", "topic": "extreme-quantization", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do you architect the deployment of a 70B parameter LLM to meet 96GB memory constraints while maximizing generation throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0103", "title": "Hopper FP8 Format Mismatch", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L2", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should FP8 formats and scaling be chosen to preserve accuracy while meeting the H100 throughput target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0104", "title": "Long-Context VRAM Exhaustion", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing OOM at only 20 concurrent 64k-context requests, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0105", "title": "Recovering Edge Vision Accuracy", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you recover the drone detector's mAP 
while keeping the 50M model fully INT8 under the 5W NPU limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0106", "title": "Systematic Activation Outliers", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantize the 13B LLM to W8A8 without losing accuracy to 100x activation-channel outliers?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 2}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0107", "title": "RecSys Network Bottlenecks", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you fix the DLRM All-to-All bottleneck and meet the 30ms P99 latency budget without hurting accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0108", "title": "Static Calibration Distribution Shift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did static W8A8 PTQ lose 15 BLEU points on 500-token legal documents, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0109", "title": "Mixed-Precision Fleet Integration", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you deploy one 50B artifact while using FP8 on modern hardware and INT8 on older hardware to hit 50,000 QPS?", "chain_ids": ["global-chain-auto-secondary-017-44"], "chain_positions": {"global-chain-auto-secondary-017-44": 0}, "chain_tiers": {"global-chain-auto-secondary-017-44": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0110", "title": "LLM Decode Batch Size Limits", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Using the roofline model, why does latency spike and throughput plateau when Llama-70B batch size reaches 64?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 4}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0111", "title": "The Quantization Roofline Paradox", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both",
"question": "Why did INT8 quantization yield only an 18% latency gain despite 4x higher INT8 TOPS and half the memory footprint?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 2}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0112", "title": "Recomputation and Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 run 4x faster at 32k context even though it performs about 15% more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0113", "title": "MoE vs Dense Arithmetic Intensity", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the MoE decode throughput 30% lower despite having fewer active parameters per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0114", "title": "DLRM Heterogeneous Bottlenecks", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For the 600GB CTR model at 30,000 QPS, should you use high-bandwidth GPUs, high-capacity CPUs, or a hybrid architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0115", "title": "Kernel Fusion Roofline Shift", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you quantitatively counter the claim that fusing 1%-FLOP element-wise ops cannot improve inference time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0116", "title": "Diffusion Resolution Scaling Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does doubling diffusion resolution from 512 to 1024 increase denoising latency by 9x instead of 4.5x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0117", "title": "Massive Context KV Cache Paging", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you page the 1M-token KV cache across HBM, host DRAM, and NVMe while sustaining 10 tokens/sec decoding speed?", "chain_ids": ["global-chain-auto-secondary-017-30"], "chain_positions": {"global-chain-auto-secondary-017-30": 1}, "chain_tiers": {"global-chain-auto-secondary-017-30": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0118", "title": "Long-Running Inference OOM Death", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the 12-hour CUDA OOMs despite 45GB theoretical use, and how would you redesign KV memory management?", "chain_ids": ["global-chain-auto-secondary-014-02"], "chain_positions": {"global-chain-auto-secondary-014-02": 2}, "chain_tiers": {"global-chain-auto-secondary-014-02": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0119", "title": "SRAM Tiling for Custom Attention", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you structure registers, shared memory, and HBM access for sliding-window attention at seq_len 131,072?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0120", "title": "MoE Weight Fetching Bottleneck", "topic": "extreme-quantization", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why is the 8x7B MoE stuck at 8 tokens/sec at batch size 1, and how would you reach the 40 tokens/sec SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0121", "title": "High-Density LoRA Adapter Swapping", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you architect a multi-tier memory hierarchy to serve 10,000 adapters without violating the 200ms SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0122", "title": "Multi-Tier Embedding Lookups", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you architect 5TB DLRM embedding lookups across HBM, host RAM, and NVMe for 10,000 QPS under 50ms P99?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0123", "title": "3D CNN Activation Checkpointing", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What hybrid recompute/offload strategy would fit 180GB of 3D UNet activations on an 80GB GPU at 1.5 iter/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0124", "title": "Edge LLM Unified Memory Sizing", "topic": "extreme-quantization", "competency_area": "optimization", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", 
"status": "published", "phase": "both", "question": "How would you lay out and quantize the 8B model and 8k KV cache to hit 20 tokens/sec without SSD swapping on 16GB Apple Silicon?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0125", "title": "The Operator Fusion Trap", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why did fusing the four element-wise ops make the 100M-element FP32 kernel take 150ms instead of the expected 25ms?", "chain_ids": ["global-chain-auto-secondary-014-15"], "chain_positions": {"global-chain-auto-secondary-014-15": 1}, "chain_tiers": {"global-chain-auto-secondary-014-15": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0126", "title": "Depthwise Separable Disappointment", "topic": "roofline-analysis", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why did replacing 3x3 convolutions with depthwise separable convolutions slow the edge NPU despite cutting MACs from 2.5G to 0.3G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0127", "title": "The H100 Speedup Discrepancy", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the H100 upgrade speed up 70B prefill by 3.1x but decode by only 1.6x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0128", "title": "The FlashAttention Paradox", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 slow the ViT attention from 0.8ms to 1.2ms at sequence length 196?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0129", "title": "The Power-Clock Anomaly", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does underclocking cores by 40% barely affect Whisper batch-1 throughput while cutting power by 35%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 2}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0130", "title": "Mobile NPU Utilization Collapse", "topic": "roofline-analysis", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 4K super-resolution model run at 333ms instead of the 10ms implied by the 10 TOPS NPU?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0131", "title": "The INT4 Quantization Plateau", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does INT4 weight-only quantization plateau at 12ms/token after INT8 halved Llama-2 13B decode latency?", "chain_ids": ["global-chain-auto-secondary-016-13"], "chain_positions": {"global-chain-auto-secondary-016-13": 1}, "chain_tiers": {"global-chain-auto-secondary-016-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0132", "title": "The KV Cache Roofline Collapse", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does increasing the batch size from 16 to 128 drop attention kernel throughput by 2.5x on an A100?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0133", "title": "Continuous Batching Decode Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 4k-token prompts cause P99 TPOT spikes to 200ms+, and how would you keep TPOT under 50ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0134", "title": "Speculative Decoding Economics", "topic": "speculative-decoding", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding with a 7B draft model worsen TPOT to 32ms, and how would you meet the 25ms SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0135", "title": "Paged Attention Internal Fragmentation", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does PagedAttention with 512-token blocks OOM at batch size 128 for 80-token translation requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0136", "title": "Disaggregated Prefill Network Bottleneck", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does disaggregated prefill/decode raise client TTFT to about 400ms over 100 Gbps Ethernet, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0137", "title": "Prompt Caching PCIe Bottleneck", "topic": "compound-ai-systems", "competency_area": "architecture", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does loading the cached 
50k-token KV prefix over PCIe Gen4 leave TTFT at 700ms, and how would you meet 500ms?", "chain_ids": ["global-chain-auto-secondary-017-54"], "chain_positions": {"global-chain-auto-secondary-017-54": 0}, "chain_tiers": {"global-chain-auto-secondary-017-54": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0138", "title": "Memory Bandwidth Bound Decode", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is 20ms TPOT impossible for a 70B FP16 model on 2x GPUs, and what architecture could meet it?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 3}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0139", "title": "Continuous Batching Queueing Death Spiral", "topic": "batching-strategies", "competency_area": "latency", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does P99 latency climb past 5 minutes at 2 QPS even though TPOT is flat at 30ms, and how would you fix capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0140", "title": "Long Context Attention FLOPs Wall", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Why does FlashAttention-2 avoid OOM but still give over 6 minutes TTFT for a 500k-token prefill, and what must change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0141", "title": "The PagedAttention OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does 512-token PagedAttention block sizing OOM at 64 short translation requests on an 80GB A100?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0142", "title": "The Speculative Decoding Paradox", "topic": "speculative-decoding", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does speculative decoding improve TPOT at 1 QPS but collapse throughput at 50 QPS, and how should you handle it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0143", "title": "Chunked Prefill Latency Spikes", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why do 4096-token prefill chunks make active Mixtral decode TPOT spike from 30ms to 116ms?", "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "global-0144", "title": "The Poisoned Prefix Cache", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the RadixAttention prefix cache hit rate 0% despite reused system prompts, and how would you restore it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0145", "title": "Continuous Batching Generation Deadlock", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the vLLM engine to stall after 10 seconds, and how would you prevent it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0146", "title": "The Multi-LoRA Throughput Cliff", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does throughput collapse when a batch contains 32 different LoRA adapters despite 100% GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0147", "title": "Ring Attention Compute Starvation", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 1M-token Ring Attention prefill stall with 1024-token chunks, and what chunk-size change would you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0148", "title": "The QSPI Thrashing Spikes", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does increasing the on-device fine-tuning batch size from 1 to 2 make epoch time jump from 400ms to 3.2s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0149", "title": "The Unaligned Power Drain", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the CMSIS-NN keyword spotting model use more energy per inference even though latency falls from 45ms to 12ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0151", "title": "The Fragmented FL Round", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the 30KB gradient buffer allocation fail in round 6 despite 80KB of free heap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "global-0152", "title": "The Strided Cache Miss", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the custom 2D convolution taking 80ms instead of the 15ms suggested by its MAC count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0153", "title": "The Wear-Out Checkpoint", "topic": "federated-learning", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the edge devices to hard fault during weight update serialization after 45 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0154", "title": "The Bus Contention Stall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does inference latency rise from 20ms to 28ms when DMA writes Pong while the CPU processes Ping?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0155", "title": "The I-Cache Thrash Loop", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the unrolled CMSIS-NN kernel halve battery life even though it still meets the 10ms deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0156", "title": "XIP Flash Cache Thrashing", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the FL sparsity update make the 200KB keyword spotting model slower and higher power despite 10% fewer FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0157", "title": "PSRAM Strided Access Overhead", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why is the ESP32-S3 backward pass about 10x slower and thermally worse than the forward pass with the same MAC count?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0158", "title": "DMA and AXI SRAM Bus Contention", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What causes the few-microsecond inference jitter and slight power increase when ADC DMA and the TFLite Micro model share AXI SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0160", "title": "Flash Block Erase Write Amplification", 
"topic": "federated-learning", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the nightly 64KB Flash weight-delta write grow from 120ms to 1.8s after 6 months?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0161", "title": "BLE DMA External RAM Wakeups", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does BLE transmission of the 30KB FL gradient buffer draw 8mA instead of letting the CPU sleep at 2mA radio current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0162", "title": "SRAM Overlay Thrashing in Backprop", "topic": "federated-learning", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 3-layer FL backward pass take 2.4s instead of the 250ms predicted by FLOP counts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0163", "title": "The Overclocking Energy Trap", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 15% GPU frequency boost reduce training time but increase total energy by nearly 40%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 1}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0164", "title": "The Quantization Cost Paradox", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does switching the 70B model from FP16 on 4 GPUs to W8A8 INT8 on 2 GPUs increase cost per 1K tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0165", "title": "The Federated Battery Drain", "topic": "federated-learning", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does pruning the FL payload from 50MB to 5MB increase battery drain when sync frequency rises from every 100 steps to every 10?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0166", "title": "The Thermal Throttling Ring", "topic": "thermal-management", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does the 64-node MoE training job lose 25% throughput every day between 2 PM and 6 PM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "global-0167", "title": "The Carbon-Aware Checkpoint Penalty", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why did pausing the 256-GPU job during high-carbon periods increase total carbon emissions by 12%?", "chain_ids": ["global-chain-auto-secondary-017-28"], "chain_positions": {"global-chain-auto-secondary-017-28": 0}, "chain_tiers": {"global-chain-auto-secondary-017-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0168", "title": "The PUE Illusion", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the datacenter PUE worsen from 1.15 to 1.30 when switching from dense LLM training to memory-bound MoE?", "chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": {"global-chain-auto-secondary-017-57": 0}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0169", "title": "The Power Capping Headroom Paradox", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does a 200W power cap cause p99 latency to jump above 500ms even though average utilization is 45% and power is 150W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0173", "title": "The Data Residency Constraint", "topic": "mlops-lifecycle", "competency_area": "cross-cutting", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Under GDPR, what is 'data residency' and why does it constrain where you can place your training infrastructure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0174", "title": "The Grid Emissions Gap", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If you run the same 1,000 GPU-hour training job in each country, how does the carbon footprint differ?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 0}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0175", "title": "The Edge Cache Inference Pattern", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What serving architecture exploits this redundancy to minimize inference costs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"global-0176", "title": "The GPU Cluster Power Wall", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total power draw, and why is securing this amount of power a major infrastructure challenge?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 0}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0177", "title": "The Risk Classification Framework", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Which risk tier does it fall under, and what are the compliance obligations?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0178", "title": "The Gradient Staleness Problem", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is this phenomenon called, and why is it harmful?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0179", "title": "The Anycast Routing Pattern", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "With anycast routing, how does the network decide which datacenter handles the request?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 0}, "chain_tiers": {"global-chain-auto-secondary-011-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0180", "title": "The Gradient Sync Penalty", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does a full FP32 gradient AllReduce for the 10B-parameter model take over the 100 Gbps, 60ms RTT cross-DC link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0181", "title": "The CO2 Per Training Run", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the total CO2 emissions, in metric tons, for the 256-A100 training run with PUE 1.3 and 400 gCO2/kWh grid intensity?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 1}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0183", "title": "The Electricity Arbitrage", "topic": 
"power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What annual savings do you get by scheduling the 1,000-GPU batch workload during the 8 off-peak hours instead of peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0184", "title": "The Gradient Compression Dividend", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After Top-1% sparsification with 4-byte values and 4-byte indices, what are the new AllReduce time and effective compression ratio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0185", "title": "The Model Sync Cost", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long does a full model sync take, and what is the monthly egress cost at $0.08/GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0186", "title": "The Async Pipeline Throughput Limit", "topic": "pipeline-parallelism", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "With 4 microbatches in flight, what are the pipeline bubble and steady-state throughput across the two 50ms stages and 40ms cross-DC link?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0187", "title": "The Right-to-Be-Forgotten Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If retraining costs $2/GPU-hour, what does each deletion request cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0188", "title": "The Follow-the-Sun Savings", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much CO2 does the 48-hour job emit running entirely in Region C versus a 'follow-the-renewables' schedule, and what reduction does that imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0189", "title": "The Time-to-First-Token Across Oceans", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What TTFT will the London and Tokyo users see, do they meet the 200 ms SLA, and what would fix Tokyo if it misses?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 1}, "chain_tiers": {"global-chain-auto-secondary-011-31": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0191", "title": "The Latency-Carbon-Cost Triangle", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Which European region should you deploy in given the 500 ms TTFT SLA, 400 ms prefill, latency, cost, and carbon tradeoffs?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 3}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0192", "title": "The Data Locality Penalty", "topic": "data-efficiency-selection", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which approach—separate country models, federated learning, or synthetic data—best balances compliance, quality, and patient safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0193", "title": "The Cable Cut Failover", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the impact of the cable cut on Asian inference traffic, and which failover strategy would you choose?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0194", "title": "The Green AI Paradox", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does moving the 256-GPU training job from Virginia to Quebec reduce net carbon after data transfer and latency costs?", "chain_ids": ["global-chain-auto-secondary-017-28"], "chain_positions": {"global-chain-auto-secondary-017-28": 1}, "chain_tiers": {"global-chain-auto-secondary-017-28": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0195", "title": "The Inference Placement Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the translation service use one 64-H100 US-East cluster or eight 8-H100 regional clusters to meet the 300 ms P95 latency target?", "chain_ids": ["global-chain-auto-secondary-011-31"], "chain_positions": {"global-chain-auto-secondary-011-31": 2}, "chain_tiers": {"global-chain-auto-secondary-011-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0196", "title": "The GPAI Threshold Debate", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", 
"question": "Does the 70B model trained on 2,048 GPUs for 90 days exceed the EU AI Act 10^25 FLOP threshold, and what does that imply?", "chain_ids": ["global-chain-auto-secondary-017-66"], "chain_positions": {"global-chain-auto-secondary-017-66": 0}, "chain_tiers": {"global-chain-auto-secondary-017-66": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0197", "title": "The Renewable Intermittency Trap", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you schedule inference and batch training on solar and grid power to minimize cost while keeping inference available 24/7?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 3}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0198", "title": "The Regional Model Routing Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should all users get the same model, or should Asian users get the smaller, faster model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0199", "title": "The Multi-Region Checkpoint Strategy", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate local hourly, local hourly + remote 6h, and remote hourly checkpoint strategies for expected loss and overhead tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0200", "title": "Matrix Multiply Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of the 4096×4096 FP16 matmul, and is it compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0201", "title": "KV Cache Memory for 7B Model Serving", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much KV cache memory does each 4096-token request need, and how many concurrent requests fit alongside the model weights on an 80GB GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0202", "title": "INT8 Quantization Serving Throughput Gain", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "After quantizing the memory-bound 13B FP16 decode model to INT8, what throughput 
should you expect?", "chain_ids": ["global-chain-auto-secondary-017-13"], "chain_positions": {"global-chain-auto-secondary-017-13": 1}, "chain_tiers": {"global-chain-auto-secondary-017-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0203", "title": "Energy Cost of a Training Run", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total electricity cost for the 64-GPU, 72-hour training run including PUE?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 1}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0204", "title": "Ring AllReduce Communication Time", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should the 2 GB gradient ring AllReduce take across 8 A100s over NVLink?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0205", "title": "PCIe vs NVLink Model Shard Transfer", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does the 10 GB tensor shard transfer take over PCIe Gen4 x16 versus NVLink 4.0?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0206", "title": "Buy vs Rent GPU Break-Even", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "At 80% utilization, after how many months does buying an A100-80GB break even versus $2.50/GPU-hour cloud rental?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 1}, "chain_tiers": {"global-chain-auto-secondary-017-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0207", "title": "Self-Attention Arithmetic Intensity During Decode", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of single-token attention with a 2048-token FP16 KV cache, and where does it fall on the A100 roofline?", "chain_ids": ["global-chain-auto-secondary-006-29"], "chain_positions": {"global-chain-auto-secondary-006-29": 2}, "chain_tiers": {"global-chain-auto-secondary-006-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0208", "title": "Activation Checkpointing Memory Savings", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L3", 
"zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much activation memory is needed with no checkpointing versus checkpointing every 4 layers?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 0}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0209", "title": "Continuous Batching Throughput vs Latency", "topic": "batching-strategies", "competency_area": "latency", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What system throughput gain and per-request latency change should continuous batching from 1 to 32 requests produce?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0210", "title": "Kernel Fusion Memory Bandwidth Savings", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much HBM traffic is saved by fusing the LayerNorm, GELU, and Dropout kernels for the (32, 2048, 4096) FP16 tensor?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 1}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0211", "title": "GPU Rack Power Density Limit", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How many 8-GPU servers fit in a 20 kW rack, and how many GPUs per rack is that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0212", "title": "Roofline Classification of Elementwise Operations", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of ReLU on 100M FP16 elements, and what maximum TFLOPS can it achieve on an A100?", "chain_ids": ["global-chain-auto-secondary-006-30"], "chain_positions": {"global-chain-auto-secondary-006-30": 1}, "chain_tiers": {"global-chain-auto-secondary-006-30": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0213", "title": "4-Bit Quantization for Consumer GPU Deployment", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the FP16, INT8, and INT4 memory footprints for the 70B model on a 24 GB GPU, and can it fit at INT4?", "chain_ids": ["global-chain-auto-secondary-016-13"], "chain_positions": {"global-chain-auto-secondary-016-13": 0}, "chain_tiers": {"global-chain-auto-secondary-016-13": "secondary"}, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0214", "title": "Cost Per 1M Tokens Served", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the marginal compute cost per 1M output tokens for the serving 2000 tokens/s at $3.50/hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0215", "title": "Tensor Parallelism Communication Volume", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the AllReduce volume per layer per forward pass under 4-way tensor parallelism, assuming batch=1 and seq_len=2048?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 1}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0216", "title": "Optimal Checkpointing Interval", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What checkpoint interval minimizes wasted compute for the cluster, and what overhead does it imply?", "chain_ids": ["global-chain-auto-secondary-016-07"], "chain_positions": {"global-chain-auto-secondary-016-07": 0}, "chain_tiers": {"global-chain-auto-secondary-016-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0217", "title": "Mixed-Precision Training Memory Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much GPU memory is needed for mixed-precision Adam model state for the 3B model, and does it fit on one GPU?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 2}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0218", "title": "Cross-Rack AllReduce Latency Impact", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How much slower is a 4 GB AllReduce across two racks over NDR versus within one NVSwitch rack?", "chain_ids": ["global-chain-auto-secondary-016-14"], "chain_positions": {"global-chain-auto-secondary-016-14": 0}, "chain_tiers": {"global-chain-auto-secondary-016-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0220", "title": "Voltage Scaling and Dynamic Power", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": 
"published", "phase": "both", "question": "What is the new dynamic power after reducing voltage and frequency by 15%?", "chain_ids": ["global-chain-auto-secondary-014-04"], "chain_positions": {"global-chain-auto-secondary-014-04": 0}, "chain_tiers": {"global-chain-auto-secondary-014-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0221", "title": "H100 vs A100 Roofline Comparison", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What are the H100 and A100 ridge points, and how much speedup does H100 give at 200 FLOP/byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0223", "title": "torch.compile Warm-Up vs Steady-State", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For a 10,000-step run, when does torch.compile break even and how much total time does it save?", "chain_ids": ["global-chain-auto-secondary-017-62"], "chain_positions": {"global-chain-auto-secondary-017-62": 0}, "chain_tiers": {"global-chain-auto-secondary-017-62": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0224", "title": "Little's Law for GPU Inference Server Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many GPUs are needed to handle 500 requests/s at 200 ms latency with 8 concurrent requests per GPU without queuing?", "chain_ids": ["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 0}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0225", "title": "Diagnosing Prefill-Decode Interference", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What likely causes the p99 token latency spikes when long prefills arrive during short decodes, and how would you mitigate it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0226", "title": "GPU Shows 40% Utilization Despite Full Batch", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why might GPU utilization be only 40% despite a fully packed batch, and how would you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0227", "title": "OOM at Step 500 but Not Step 1", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "both", "question": "What most likely causes the gradual memory climb and OOM at step 500, and how would you debug it?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 3}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0228", "title": "When INT8 Quantization Hurts More Than It Helps", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can INT8 quantization preserve perplexity yet drop RAG factual accuracy from 89% to 71%, and what should you do?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0229", "title": "Pipeline Parallelism Bubble Overhead Analysis", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the bubble overhead and total time for the 4-stage/16-microbatch versus 8-stage/32-microbatch pipelines?", "chain_ids": ["global-chain-auto-secondary-017-64"], "chain_positions": {"global-chain-auto-secondary-017-64": 1}, "chain_tiers": {"global-chain-auto-secondary-017-64": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0230", "title": "NaN Gradients Appearing After Learning Rate Warmup", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What likely causes NaN gradients right after LR warmup in FP16 mixed-precision training, and how would you fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0231", "title": "ECMP Hash Collision Causing Training Slowdown", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What explains deterministic AllReduce slowdowns for the same node pairs when average network use is low but some links are saturated?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0232", "title": "Spot Instance Strategy for Fault-Tolerant Training", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze whether spot instances are cost-effective for a 100-hour training run?", "chain_ids": ["global-chain-auto-secondary-017-46"], "chain_positions": {"global-chain-auto-secondary-017-46": 0}, "chain_tiers": {"global-chain-auto-secondary-017-46": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0233", "title": "Carbon-Aware Scheduling Tradeoff", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", 
"zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the carbon tradeoff of running during Region A's daytime vs Region B anytime, considering a 15% compute overhead from cross-region data transfer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0234", "title": "KV Cache Eviction Under Memory Pressure", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How should you choose a KV cache eviction policy under memory pressure for 50 active inference requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0235", "title": "Why FlashAttention is Faster Despite More FLOPs", "topic": "flash-attention", "competency_area": "memory", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can FlashAttention run faster than standard attention despite doing extra recomputation FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0236", "title": "Diagnosing Stragglers in Synchronous Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing the intermittent stragglers across random GPUs in synchronous training?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0237", "title": "torch.compile Recompilation Storm", "topic": "graph-compilation", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes this compilation storm and how do you fix it?", "chain_ids": ["global-chain-auto-secondary-017-62"], "chain_positions": {"global-chain-auto-secondary-017-62": 1}, "chain_tiers": {"global-chain-auto-secondary-017-62": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0238", "title": "Tail Latency Sources in Multi-Model Serving", "topic": "tail-latency", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do tail latencies compound super-linearly in this sequential pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0239", "title": "Optimal Power Cap for Training Cost Minimization", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you analyze the cost-optimal power cap?", "chain_ids": ["global-chain-auto-secondary-014-03"], "chain_positions": {"global-chain-auto-secondary-014-03": 2}, "chain_tiers": {"global-chain-auto-secondary-014-03": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0240", "title": "Optimizing a Memory-Bound Training Bottleneck", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an optimization strategy to reduce this overhead by at least 50%?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 2}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0241", "title": "Maximizing Concurrent Users on Fixed GPU Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design optimizations to maximize concurrent users within the same hardware and SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0242", "title": "Choosing 3D Parallelism Configuration", "topic": "3d-parallelism", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the optimal (TP, PP, DP) configuration?", "chain_ids": ["global-chain-auto-secondary-017-58"], "chain_positions": {"global-chain-auto-secondary-017-58": 2}, "chain_tiers": {"global-chain-auto-secondary-017-58": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0243", "title": "Optimizing Time-to-First-Token for Interactive Chat", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an optimization plan targeting the prefill phase specifically?", "chain_ids": ["global-chain-auto-secondary-010-07"], "chain_positions": {"global-chain-auto-secondary-010-07": 0}, "chain_tiers": {"global-chain-auto-secondary-010-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0244", "title": "Reducing AllReduce Bottleneck in Cross-Rack Training", "topic": "collective-communication", "competency_area": "networking", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design optimizations to reduce communication overhead below 15%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0245", "title": "Right-Sizing an Inference Fleet for Variable Traffic", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the optimal fleet composition to minimize annual cost while meeting peak demand?", "chain_ids": 
["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 1}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0246", "title": "Designing a Mixed-Precision Serving Strategy", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design a mixed-precision quantization strategy to meet the 2000 tok/s target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0247", "title": "Optimizing Checkpoint Strategy for 1000-GPU Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a checkpointing strategy to reduce waste below 15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0248", "title": "Optimizing a Compiled Inference Graph for Throughput", "topic": "kernel-fusion", "competency_area": "optimization", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a compilation optimization strategy for decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0249", "title": "Maximizing Tokens-per-Watt for Sustainable Inference", "topic": "sustainability-carbon-accounting", "competency_area": "power", "track": "global", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you design an optimization plan to meet the carbon target without reducing throughput?", "chain_ids": ["global-chain-auto-secondary-017-27"], "chain_positions": {"global-chain-auto-secondary-017-27": 2}, "chain_tiers": {"global-chain-auto-secondary-017-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0250", "title": "HBM Bandwidth Ceiling on A100", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the peak HBM2e memory bandwidth of an A100-80GB GPU, and how does it compare to the H100's HBM3 bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0251", "title": "Arithmetic Intensity Definition", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the arithmetic intensity of a matrix-vector multiply y = Wx where W is 4096x4096 in FP16?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 0}, 
"chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0252", "title": "Bytes per Parameter Across Precisions", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the total VRAM footprint for a 70B parameter model in each format?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0253", "title": "TDP and Energy Cost of GPU Hours", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much energy (in kWh) does a single A100 consume running at full power for 24 hours, and what does this cost at $0.10/kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0254", "title": "NVLink vs PCIe Bandwidth Gap", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the bidirectional bandwidth of NVLink 4.0 vs PCIe Gen5 x16, and why does this gap matter for multi-GPU training?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 0}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0255", "title": "Three Axes of Parallelism", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What are data, tensor, and pipeline parallelism in distributed LLM training, and what does each partition?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 0}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0256", "title": "The A100 Ridge Point", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What does this number physically mean?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0257", "title": "GPU-Hour Cost Decomposition", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the approximate cost components (hardware amortization, electricity, cooling, networking, staff) that make up this price?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 0}, "chain_tiers": {"global-chain-auto-secondary-017-45": 
"secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0258", "title": "KV Cache Memory per Token", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How much KV cache memory is needed per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0259", "title": "Why Kernel Fusion Matters", "topic": "kernel-fusion", "competency_area": "latency", "track": "global", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is kernel fusion, why does it improve GPU performance, and what primary bottleneck does it eliminate?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 0}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0260", "title": "Training Memory Budget Breakdown", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why training a 1B parameter model in mixed-precision (FP16 forward/backward + FP32 Adam optimizer) requires approximately 16 GB of memory, not just the 2 GB for FP16 weights?", "chain_ids": ["global-chain-auto-secondary-014-01"], "chain_positions": {"global-chain-auto-secondary-014-01": 1}, "chain_tiers": {"global-chain-auto-secondary-014-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0261", "title": "Memory-Bound vs Compute-Bound Intuition", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why is batch-1 autoregressive LLM decoding memory-bandwidth-bound while large-batch training is compute-bound, according to the roofline model?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 1}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0262", "title": "AllReduce Communication Overhead", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Why does this approach O(2x gradient size) as N grows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0263", "title": "Why Activation Outliers Break Quantization", "topic": "vram-budgeting", "competency_area": "memory", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does a single activation outlier destroy INT8 quantization accuracy, and how do techniques like LLM.int8() address 
this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0264", "title": "InfiniBand vs Ethernet for Training", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What specific properties of InfiniBand reduce AllReduce latency compared to Ethernet?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 1}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0265", "title": "INT8 vs INT4 for Production LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Compare INT8 vs INT4 quantization for a 70B model serving 256-token responses under a 200ms SLA. Which do you choose and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0266", "title": "Tensor vs Pipeline Parallelism for 70B", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Compare TP=8 (tensor parallel across all 8) vs PP=8 (pipeline parallel, 8 stages). Which is better for this configuration and why?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 2}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0267", "title": "Spot vs On-Demand for Long Training Runs", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate: spot, on-demand, or a hybrid approach?", "chain_ids": ["global-chain-auto-secondary-017-46"], "chain_positions": {"global-chain-auto-secondary-017-46": 1}, "chain_tiers": {"global-chain-auto-secondary-017-46": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0268", "title": "Static vs Continuous Batching for LLM Serving", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantify the throughput difference for a batch of 32 requests where lengths range from 50 to 1500 tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0269", "title": "A100 vs H100 Performance per Watt", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many PFLOPS can you achieve with A100s versus H100s in this 2MW facility?", "chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": 
{"global-chain-auto-secondary-017-57": 1}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0270", "title": "CPU Offloading vs Activation Recomputation", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which is faster for fitting activations on one A100: CPU offloading over PCIe or activation recomputation?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 2}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0271", "title": "Tensor Parallelism Within vs Across Nodes", "topic": "interconnect-topology", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you quantify the communication overhead of each approach?", "chain_ids": ["global-chain-auto-secondary-017-26"], "chain_positions": {"global-chain-auto-secondary-017-26": 1}, "chain_tiers": {"global-chain-auto-secondary-017-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0272", "title": "Prefill Chunking vs Monolithic Prefill", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the latency and throughput tradeoffs between monolithic and chunked prefill?", "chain_ids": ["global-chain-auto-secondary-010-07"], "chain_positions": {"global-chain-auto-secondary-010-07": 1}, "chain_tiers": {"global-chain-auto-secondary-010-07": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0273", "title": "Eager vs Compiled Execution for Inference", "topic": "kernel-fusion", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate when compilation pays for itself and identify scenarios where it doesn't?", "chain_ids": ["global-chain-auto-secondary-017-55"], "chain_positions": {"global-chain-auto-secondary-017-55": 3}, "chain_tiers": {"global-chain-auto-secondary-017-55": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0274", "title": "GPU vs CPU Data Preprocessing for Training", "topic": "mlops-lifecycle", "competency_area": "data", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you compare doubling CPU cores, GPU preprocessing, and offline preprocessed shards for this training pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0275", "title": "On-Prem vs Cloud GPU Cluster Economics", "topic": 
"tco-cost-modeling", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "At what utilization does an on-prem deployment break even compared to cloud renting?", "chain_ids": ["global-chain-auto-secondary-017-45"], "chain_positions": {"global-chain-auto-secondary-017-45": 2}, "chain_tiers": {"global-chain-auto-secondary-017-45": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0276", "title": "Latency Optimization vs Throughput Optimization", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate which configuration is better for (a) a real-time chatbot and (b) a batch document summarization pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0278", "title": "Batch Size to Reach Compute-Bound Regime", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What minimum batch size pushes the main GEMM operations into the compute-bound regime, and what are the latency implications?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 3}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0279", "title": "Multi-LoRA Serving Architecture", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving architecture that minimizes GPU cost while meeting P99 < 300ms TTFT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0280", "title": "Resilient 1024-GPU Training System", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a fault-tolerant training system for a 175B model on 1024 GPUs where individual GPU failures occur every 2-4 hours?", "chain_ids": ["global-chain-auto-secondary-017-07"], "chain_positions": {"global-chain-auto-secondary-017-07": 1}, "chain_tiers": {"global-chain-auto-secondary-017-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0281", "title": "Multi-Model Serving Gateway Design", "topic": "mlops-lifecycle", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving gateway for a company running 15 LLMs (7B to 70B) across 48 80GB GPUs?", "chain_ids": ["global-chain-auto-secondary-017-44"], "chain_positions": 
{"global-chain-auto-secondary-017-44": 1}, "chain_tiers": {"global-chain-auto-secondary-017-44": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0282", "title": "3D Parallelism Configuration for 175B", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the parallelism configuration (DP, TP, PP, micro-batch size) to train a 175B GPT-style model with global batch size 2048 and sequence length 2048?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 3}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0283", "title": "Trillion-Token Data Pipeline Architecture", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a data pipeline to feed this 1024-GPU training cluster under these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0284", "title": "Training Observability Dashboard Design", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What metrics, alerts, and dashboards do you build?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0285", "title": "Memory Budget for High-Concurrency LLM Serving", "topic": "memory-pressure-management", "competency_area": "memory", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the binding constraint that prevents serving 256 concurrent requests, and how would you resolve it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0286", "title": "Network Topology for 2048-GPU Training Cluster", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the optimal switch topology, link speeds, and total switch port count for this cluster?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 3}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0289", "title": "Power and Cooling for a 1000-GPU Cluster", "topic": "energy-per-operation", "competency_area": "power", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you specify the cooling technology, power distribution, and UPS configuration?", 
"chain_ids": ["global-chain-auto-secondary-017-57"], "chain_positions": {"global-chain-auto-secondary-017-57": 2}, "chain_tiers": {"global-chain-auto-secondary-017-57": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0291", "title": "Speculative Decoding Architecture", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you calculate the expected speedup assuming a 70% draft acceptance rate and specify memory allocation on an 80GB GPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0292", "title": "Expert Parallelism for MoE Training", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you specify the expert partitioning, handle token routing all-to-all communication, and estimate the communication overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0293", "title": "Real-Time Tokenization Pipeline at Scale", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "global", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you specify the architecture to achieve 10M tokens/sec throughput?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0294", "title": "TTFT Latency Spike Diagnosis", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you diagnose the root cause of the TTFT latency spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0295", "title": "Sudden Training Throughput Drop", "topic": "interconnect-topology", "competency_area": "parallelism", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is happening to cause this sudden 38% throughput drop?", "chain_ids": ["global-chain-auto-secondary-017-26"], "chain_positions": {"global-chain-auto-secondary-017-26": 0}, "chain_tiers": {"global-chain-auto-secondary-017-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0296", "title": "OOM During Evaluation but Not Training", "topic": "activation-memory", "competency_area": "memory", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the model OOM during evaluation but not during training?", "chain_ids": ["global-chain-auto-secondary-017-59"], "chain_positions": {"global-chain-auto-secondary-017-59": 1}, "chain_tiers": {"global-chain-auto-secondary-017-59": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"global-0297", "title": "Loss Plateau After Learning Rate Warmup", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely cause of the training loss plateau?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0298", "title": "Mysterious 15% Throughput Drop at Noon", "topic": "power-budgeting", "competency_area": "power", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is causing the consistent 15% throughput drop between 11 AM and 3 PM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0299", "title": "Replication vs Erasure Coding for Checkpoint Storage", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate storage overhead, write throughput, and recovery time for 3-way replication vs 10+4 Reed-Solomon erasure coding?", "chain_ids": ["global-chain-auto-secondary-017-07"], "chain_positions": {"global-chain-auto-secondary-017-07": 0}, "chain_tiers": {"global-chain-auto-secondary-017-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0300", "title": "Operational Intensity Ridge Point Calculation", "topic": "latency-decomposition", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Is a matrix multiplication layer with 100 FLOPs per byte memory-bound or compute-bound on an A100 80GB?", "chain_ids": ["global-chain-auto-secondary-010-06"], "chain_positions": {"global-chain-auto-secondary-010-06": 2}, "chain_tiers": {"global-chain-auto-secondary-010-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0301", "title": "FP16 Model VRAM and Load Time Estimation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Can a 7B FP16 LLM fit on a 16GB GPU, and what is the minimum time to read its weights once?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0302", "title": "INT8 Quantization Decoding Speed on H100", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the memory savings achieved by quantization and the maximum theoretical decoding speed per token, assuming inference is strictly memory-bandwidth bound?", "chain_ids": ["global-chain-auto-secondary-014-14"], "chain_positions": {"global-chain-auto-secondary-014-14": 1}, "chain_tiers": {"global-chain-auto-secondary-014-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "global-0303", "title": "Tensor Parallelism GPU Count and NVLink Delay", "topic": "model-tensor-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How do you calculate the minimum number of GPUs required for weights, and what is the NVLink transfer time for a 12MB activation tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0304", "title": "Pipeline Bubble Delay Calculation", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the pipeline bubble delay added by transferring a 250MB intermediate activation tensor between the two nodes over the network?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0305", "title": "GPU Time Savings for Training Run", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many training hours are saved by using one A100 (312 TFLOPS peak) instead of one V100 (125 TFLOPS peak) for 10^21 FLOPs at 30% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0307", "title": "KV Cache Capacity Calculation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you determine the available memory for KV cache and calculate how many total tokens can be stored concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0308", "title": "Offloaded Weights Latency Profiling", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the total time and identify the primary bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0309", "title": "SRAM Constrained INT8 Model Sizing", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the largest INT8 keyword-spotting model that can fit in the remaining SRAM of a Cortex-M4 device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0310", "title": "INT4 Token Generation Throughput", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does FP16 fail, and what is the max theoretical token generation throughput using INT4?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0311", "title": "Pipeline Parallelism NVLink Transfer Delay", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the per-GPU weight memory footprint and compute the transfer delay if 150MB activations are sent between stages via NVLink?", "chain_ids": ["global-chain-auto-secondary-017-64"], "chain_positions": {"global-chain-auto-secondary-017-64": 0}, "chain_tiers": {"global-chain-auto-secondary-017-64": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0312", "title": "Ring All-Reduce Synchronization Delay", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long should a ring all-reduce take to synchronize a 2GB gradient tensor across exactly two nodes over IB NDR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0313", "title": "Cost Calculation for Continuous Generation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the exact dollar cost incurred to generate exactly 1.5 million tokens at continuous full load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0314", "title": "Datacenter Rack Power Draw Estimation", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What facility power does an 8x A100 server draw after adding host power and applying a PUE of 1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0315", "title": "Batch Size and Token Generation Time", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you determine the maximum batch size, and calculate the memory-bound token generation time for that full batch?", "chain_ids": ["global-chain-auto-secondary-014-16"], "chain_positions": {"global-chain-auto-secondary-014-16": 1}, "chain_tiers": {"global-chain-auto-secondary-014-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0317", "title": "KV Cache Memory Sizing for 8B Model on 32GB V100", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What maximum context length can an 8B FP16 model support on a single 32 GiB V100 after reserving memory for its KV cache?", 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0318", "title": "Sizing Cluster Memory for 65B Parameter Model", "topic": "data-parallelism", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the total memory needed for optimizer states, gradients, and parameters, and determine the minimum number of GPUs required?", "chain_ids": ["global-chain-auto-secondary-017-06"], "chain_positions": {"global-chain-auto-secondary-017-06": 1}, "chain_tiers": {"global-chain-auto-secondary-017-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0319", "title": "Latency Comparison: PCIe Gen4 vs NVLink for KV Cache Offload", "topic": "roofline-analysis", "competency_area": "compute", "track": "global", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the latency difference between these two interconnect pathways for the full 40GB transfer?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0335", "title": "Topology-Aware Placement Rule", "topic": "interconnect-topology", "competency_area": "networking", "track": "global", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How should an inference scheduler place 8 model shards to avoid topology-induced communication bottlenecks?", "chain_ids": ["global-chain-auto-secondary-017-25"], "chain_positions": {"global-chain-auto-secondary-017-25": 2}, "chain_tiers": {"global-chain-auto-secondary-017-25": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0339", "title": "Data Validation Budget Optimization", "topic": "data-quality-validation", "competency_area": "data", "track": "global", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you reduce validation CPU cost by 80% while still catching schema regressions in 100M daily records?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0358", "title": "Extreme Quantization Acceptance Spec", "topic": "extreme-quantization", "competency_area": "precision", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What acceptance criteria should gate a 2-bit quantized model for mobile and edge release?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0362", "title": "Cross-Regime Recovery Budget", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design failure recovery for 2 TB cloud checkpoints, 20 MB mobile adapters, and 64 KB TinyML calibration state?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0374", "title": "Specify Overlap Acceptance Criteria", "topic": "communication-computation-overlap", 
"competency_area": "parallelism", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What acceptance criteria should be specified before claiming communication-computation overlap?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0375", "title": "Cross-Regime Compute Cost Spec", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What inputs are required to estimate cost per successful interaction across cloud, mobile, and edge inference when retries can fail?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 1}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0378", "title": "Incomplete Compute Cost Procurement Decision", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "global", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "What information is missing before choosing buy, rent, or on-device inference, and what can be bounded from FLOPs and traffic alone?", "chain_ids": ["global-chain-auto-secondary-017-61"], "chain_positions": {"global-chain-auto-secondary-017-61": 2}, "chain_tiers": {"global-chain-auto-secondary-017-61": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0383", "title": "Fairness Metrics Across Product Surfaces", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "global", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What fairness evaluation plan stays comparable when cloud, mobile, and kiosk deployments expose different subgroup attributes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0397", "title": "Overlap Window Sizing", "topic": "communication-computation-overlap", "competency_area": "parallelism", "track": "global", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What communication time remains exposed if 240 ms of all-reduce can overlap with only 180 ms of backward compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0401", "title": "Cluster Rollout Capacity With GPU Pods", "topic": "container-orchestration", "competency_area": "deployment", "track": "global", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Is draining 10% of 60 GPU pods safe at 1,350 rps if utilization must stay below 80%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0421", "title": "Global Observability Signal Gap", "topic": "monitoring-observability", "competency_area": "reliability", "track": "global", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why can cloud metrics look healthy while mobile task success drops after the same 
model rollout?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0432", "title": "Evaluating Continuous vs Fixed Batching Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate whether the endpoint can remain stable under an arrival rate of 100 req/s, and calculate the minimum average batch size required?", "chain_ids": ["global-chain-auto-secondary-017-65"], "chain_positions": {"global-chain-auto-secondary-017-65": 0}, "chain_tiers": {"global-chain-auto-secondary-017-65": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0434", "title": "Memory Bandwidth Bottleneck Analysis for Custom Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "global", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the arithmetic intensity of a 64x64 block load and calculate the required data reuse to saturate the tensor cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0435", "title": "Heterogeneous Routing Algorithm for LLM Multi-Region Deployment", "topic": "queueing-theory", "competency_area": "latency", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a load-shedding and routing algorithm that minimizes p99 latency during unpredictable traffic surges without dropping requests?", "chain_ids": ["global-chain-auto-secondary-017-65"], "chain_positions": {"global-chain-auto-secondary-017-65": 1}, "chain_tiers": {"global-chain-auto-secondary-017-65": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0436", "title": "Evaluating Synchronous vs Asynchronous Checkpointing Trade-offs", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "global", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which checkpointing strategy yields higher goodput if a node fails exactly once every 10 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0437", "title": "Designing 3D Parallelism Topology for Inter-Node Bottlenecks", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create an optimal placement strategy for Tensor (TP), Pipeline (PP), and Data Parallelism (DP) to prevent the 400 Gbps links from starving the GPUs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0438", "title": "Evaluating Compute-Communication Overlap in MoE Layers", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "global", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": 
"both", "question": "Evaluate whether dividing the workload into 4 overlapping micro-steps completely hides the AllToAll communication latency behind compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "global-0440", "title": "Multi-Tenant LoRA Serving Architecture on MI300X", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "global", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you manage memory and request batching to serve all adapters without reloading the base model?", "chain_ids": ["global-chain-auto-secondary-011-32"], "chain_positions": {"global-chain-auto-secondary-011-32": 2}, "chain_tiers": {"global-chain-auto-secondary-011-32": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0001", "title": "The OTA Cellular Limit", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical size limit you must stay under for a cellular download that doesn't require explicit user opt-in?", "chain_ids": ["mobile-chain-auto-001-10"], "chain_positions": {"mobile-chain-auto-001-10": 0}, "chain_tiers": {"mobile-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 MB", "2 GB", "~200 MB", "Unlimited, as long as the user has a data plan."], "correct_index": 2}}, {"id": "mobile-0004", "title": "The OTA Data Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the data consumption for a single user if you ship the full update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The update will only transfer the 40 MB difference, which is a manageable size.", "The 10% accuracy gain is worth the user data cost; we should ship the 120 MB update immediately.", "The update will consume 120 MB of the user's data plan, a significant cost we must address.", "The update is about 960 Megabits (Mb), which requires a Wi-Fi connection."], "correct_index": 2}}, {"id": "mobile-0005", "title": "The OTA Update Budget Shock", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What total carrier data cost results from rolling out a 350 MB model update to 5 million users at $2 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$35,000", "$350,000", "$3,500,000", "$1,750,000"], "correct_index": 2}}, {"id": "mobile-0006", "title": "The OTA Bandwidth Bottleneck", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following is 
the most critical physical constraint to consider for the update delivery mechanism?", "chain_ids": ["mobile-chain-auto-001-05", "mobile-chain-auto-001-06"], "chain_positions": {"mobile-chain-auto-001-05": 0, "mobile-chain-auto-001-06": 0}, "chain_tiers": {"mobile-chain-auto-001-05": "primary", "mobile-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["On-device flash storage capacity", "CPU cycles required for model decompression", "Unreliable, low-bandwidth cellular connectivity", "Power consumption during the download"], "correct_index": 2}}, {"id": "mobile-0008", "title": "The OTA Budget Constraint", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Will a 70M-parameter FP16 model fit under a 150 MB OTA update budget?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["280 MB, so it fails the requirement (Assuming FP32).", "70 MB, so it easily meets the requirement (Assuming INT8).", "140 MB, so it meets the requirement.", "17.5 MB, so it easily meets the requirement."], "correct_index": 2}}, {"id": "mobile-0010", "title": "The Conversion Cliff", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong during the seemingly successful PyTorch to CoreML conversion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0011", "title": "The App Store Privacy Rejection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did Apple reject an on-device Core ML face-age filter for missing a face-data purpose string, and what must change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apple treats face-derived age estimation as sensitive face/biometric data regardless of processing location. Fix camera capture with NSCameraUsageDescription, disclose face/biometric processing in App Privacy details as applicable, and use an accurate permission flow. NSFaceIDUsageDescription is only for Face ID authentication.", "We don't collect face data — the model runs on-device, so there's no privacy issue. 
Apple's privacy framework doesn't distinguish between on-device processing and server-side collection for certain sensitive data categories.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance.", "Deploy the update to all devices simultaneously to minimize the total rollout duration and reduce the window of version inconsistency."], "correct_index": 0}}, {"id": "mobile-0012", "title": "The App Size Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a 500 MB FP32 model ship in an app that must stay under the 200 MB cellular download limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0013", "title": "The App Store Model Size Rejection", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a 350 MB diffusion model be delivered to avoid the iOS 200 MB cellular download limit warning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0014", "title": "The ML Crash vs Silent Failure", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you detect and diagnose ML failures that don't crash the app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The accuracy degradation is caused by numerical instability in the framework's matrix multiplication kernels, not by the model or data.", "If there are no crashes, the model is working fine. ML models fail silently — they return valid tensors filled with garbage instead of throwing exceptions.", "ML failures are invisible to standard crash reporting because the model always returns *something* — a valid tensor of zeros, random confidences, or stale results.
You need ML-specific health monitoring.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements."], "correct_index": 2}}, {"id": "mobile-0015", "title": "The App Store ML Review Trap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a Core ML model that works on an iPhone 15 Pro crash on an older iPhone 8, and how must model delivery handle Apple's heterogeneous hardware matrix?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0016", "title": "The Feature Flag Footgun", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does enabling six ML features at once make a 6 GB RAM device kill apps and the launcher?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Each model is only 20-30 MB, so 6 models = 180 MB. That should easily fit in 6 GB RAM.", "The problem is cumulative ML runtime memory, not model file size. Each model's runtime footprint is 3-5x file size. Six models at ~100 MB runtime each = 600 MB, creating severe memory pressure.", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model.", "Moving to a memory-mapped file loading strategy will reduce peak memory to near-zero since only accessed weights are loaded."], "correct_index": 1}}, {"id": "mobile-0017", "title": "The ML Error That Looks Like a Feature", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an iOS update make portrait mode look more dreamy, and should the camera app ship or fix the change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Rollback capability is unnecessary if the model was validated in the cloud.", "If engagement is up by 5%, the model is working better — don't touch it.", "The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "Diagnose first: compare fixed pre-update and post-update inputs, tensors, and masks. 
If an iOS camera or preprocessing change caused uncontrolled blur-mask drift, roll back or normalize the pipeline before treating the dreamier look as a product experiment."], "correct_index": 3}}, {"id": "mobile-0018", "title": "The ML Notification Backlash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 78% precision make news notifications feel wrong at launch scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["78% precision at high volume is a UX disaster: 22% irrelevant x 40 notifications/day = 8.8 bad notifications, far exceeding user tolerance of 3-5 total. Fix: cap at 3 notifications/day, selecting highest-confidence predictions. Top-3 from 40 candidates achieves ~95% precision.", "The issue stems from OS-level scheduling interference; pinning the inference thread to a dedicated core will eliminate the performance variance.", "78% precision means 78% of notifications are relevant — that's pretty good.", "OTA updates should always include the full model file to ensure atomicity; delta updates risk corrupting the model."], "correct_index": 0}}, {"id": "mobile-0020", "title": "The Secure Enclave Boundary", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you protect the embedding in transit, and why can't you just run the ML model inside the Secure Enclave?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Edge-side health checks should focus on hardware metrics; model-level metrics are too noisy.", "The Secure Enclave has ~256 KB of memory. 
It cannot run a model requiring ~25 MB of working memory.", "Run the face verification model inside the Secure Enclave for end-to-end security.", "Model obfuscation through weight scrambling provides equivalent security to hardware-backed encryption."], "correct_index": 1}}, {"id": "mobile-0021", "title": "The Cold Start Jitter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What system-level factors contribute to this initial delay, and how would you optimize them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0022", "title": "The Offline-First ML Design", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design an offline-first architecture that bridges the accuracy gap within a strict on-device storage budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The memory leak is in the framework's autograd graph; disabling gradient computation with torch.no_grad() will fix it.", "Apply INT8 post-training quantization directly to ResNet-152 to shrink it from 230 MB to 57 MB to fit on-device.", "Design a tiered offline-first system: ship the domain-tuned 5.3 MB model on-device. This achieves a 43x size reduction and easily fits within the device's RAM and storage budgets.", "Prune the 230 MB model dynamically at runtime based on available device RAM."], "correct_index": 2}}, {"id": "mobile-0023", "title": "The On-Device Model Hot-Swap", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you push a model fix in under 1 hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Ship the model as a server-side config and swap it remotely.", "Model weights are executable code, taking 4 hours to recompile on device.", "The solution is to reduce model complexity until it fits within the hardware constraints.", "Build dynamic model delivery within platform guidelines using a CDN and atomic pointer swap."], "correct_index": 3}}, {"id": "mobile-0024", "title": "The Silent Accuracy Degradation", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you detect six-month on-device classifier degradation when there are no server-side ground-truth labels?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Monitor accuracy using a held-out test set.
You don't have labels for on-device predictions — there's no test set.", "Network I/O is the dominant bottleneck; optimizing the model or inference pipeline will have minimal impact on end-to-end latency.", "The system should be redesigned to offload inference to the cloud, as edge hardware fundamentally cannot meet the latency and accuracy requirements.", "Four proxy signals detect degradation without ground truth: (1) confidence distribution shift from aggregated histograms, (2) prediction entropy summaries, (3) feature/input drift statistics, and (4) behavioral proxies such as clicks, retakes, or undo rate."], "correct_index": 3}}, {"id": "mobile-0025", "title": "The A/B Test Without Ground Truth", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you A/B test two on-device models when you can't observe the predictions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The key insight is that on mobile, the hardware itself generates the telemetry you need. Design an on-device A/B testing framework that uses hardware-level signals as proxy metrics when ground truth is unavailable.", "Send all predictions to the server for analysis. This defeats the purpose of on-device inference (privacy, latency) and may violate your privacy policy.", "The primary issue is insufficient hardware capability; upgrading to next-generation hardware will resolve the performance gap without software changes.", "The solution is to reduce model complexity until it fits within the hardware constraints, accepting the accuracy trade-off as unavoidable."], "correct_index": 0}}, {"id": "mobile-0026", "title": "The On-Device Model Encryption Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can Core ML and TFLite on-device model bundles be protected after a competitor extracts them in 10 minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Defense-in-depth with hardware-backed protection: Core ML encryption (requires Secure Enclave exploit) and Android split architecture.", "Encrypt the model file with AES-256 and decrypt it at runtime. 
This protects the file at rest, but the decrypted model must live in memory during inference.", "Use ProGuard or DexGuard to obfuscate the app's Java/Swift code, which automatically encrypts the bundled assets like ML models.", "Deploy the models as dynamic libraries (.so or .dylib) instead of flatbuffers, which prevents them from being reverse-engineered."], "correct_index": 0}}, {"id": "mobile-0027", "title": "The Mobile ML Telemetry Budget", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you design the telemetry system to keep ML telemetry for 5M DAU under a $500/month budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0028", "title": "The Model Rollback Nightmare", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you roll back only Samsung Galaxy S21 Android 13 users after a 40% no-detection spike?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0029", "title": "The ML Crash Report Black Hole", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose and fix a rare kernel crash without source access or local reproduction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0030", "title": "The App Size Audit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you find at least 60 MB of app size savings while preserving all four mobile ML features?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0031", "title": "The Mobile ML CI/CD Pipeline", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should CI/CD stop PyTorch-to-Core ML/TFLite handoffs from shipping a wrong-dataset model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0032", "title": "The Model Cache Eviction Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 30 MB photo-enhancement model in iOS Caches keep disappearing and taking 12s to reload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0033", "title": "The Device-Free ML Testing Strategy", "topic": "monitoring-observability", 
"competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a camera ML team cover 200M iOS and Android users with only 5 local test phones?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0034", "title": "The Cross-Version Compatibility Maze", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship one app that works optimally across all these OS versions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Target the lowest common denominator — use only features available on iOS 16 and Android 10.", "Model tiering keyed to runtime capabilities, not OS version. Probe available APIs at launch: CoreML version, TFLite delegate support, quantization compatibility.", "Bundle all three model variants (48 MB total) to ensure immediate offline availability without runtime downloads.", "Use ONNX Runtime exclusively to bypass OS-level APIs and guarantee identical execution across all versions."], "correct_index": 1}}, {"id": "mobile-0035", "title": "Layered Orthogonal Assignment for Concurrent Mobile Experiments", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do 12 concurrent mobile ML experiments interfere across 2M DAU, and how should assignment be isolated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use layered or orthogonal assignment so mutually interacting experiments do not overlap for the same user, while safe layers use independent hash namespaces and remain balanced by device class and OS.", "Run each experiment on a separate user cohort. 
With 12 experiments and 2M DAU, each cohort gets ~167K users.", "The bottleneck is in the software framework, not the hardware; switching to a lower-level API will achieve near-theoretical-peak performance.", "Parallelizing pipeline stages across CPU and GPU always improves throughput, regardless of the relative speeds of each stage."], "correct_index": 0}}, {"id": "mobile-0036", "title": "Core ML Inference Regression After App Update", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What non-model regression could explain identical model binaries becoming slower after a mobile app update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0037", "title": "The Production Mobile ML Observability Stack", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the observability system that tells you, within 5 minutes, if any model is degrading on any device segment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0038", "title": "The Delivery Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What breaks at this scale that doesn't break in a demo?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 4}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The safest strategy is to update firmware and model in a single atomic package to avoid version skew between them.", "The OOM is caused by memory fragmentation in the Python runtime; running garbage collection before inference will prevent it.", "Just use on-demand download after install — it's under 2 GB. This works for a prototype but collapses at 500M-device scale.", "At 500M installs, demo-safe on-demand delivery breaks because repeated large transfers become a CDN, storage, and reliability problem. 
Use optional asset delivery for the initial 1.7 GB model, resumable downloads, mmap-friendly packaging, staged rollout, and delta patches; a full update costs about $8.5M, while a 6% delta is still about $510K per update."], "correct_index": 3}}, {"id": "mobile-0039", "title": "The Multi-Model Orchestration Nightmare", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you orchestrate pose, speech, and gesture models concurrently on one mobile SoC without missing real-time budgets?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0040", "title": "The User Consent Minefield", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does on-device federated learning require explicit user consent under GDPR even if data never leaves the device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0042", "title": "The Federated Keyboard", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What training approach would you use so the keyboard model learns from user typing while raw keystrokes never leave each device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0043", "title": "On-Device LoRA Personalization for CLIP Photo Search", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size and manage LoRA personalization for an on-device CLIP-like photo search model regarding memory, time, and migration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0044", "title": "The Keyboard Prediction Privacy Leak", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can top-5 keyboard predictions over 1000 queries leak private text even when the model stays on-device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0045", "title": "The On-Device Differential Privacy Budget", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can federated health training fit 100 rounds under a yearly differential privacy budget of epsilon 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0047", "title": "Designing Federated Learning for a 500M DAU Social Media Feed", 
"topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a federated learning system for a social media app with 500 million daily active users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0049", "title": "The Privacy-Utility Squeeze", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What should you tell the PM when epsilon 2 differential privacy makes federated next-word prediction 40% worse, and how can you improve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0052", "title": "The 50-Feature Mobile ML Platform", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a super-app with 50 ML features fit on 3 GB phones without shipping 50 separate full models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0053", "title": "The Model Loading I/O Cliff", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is there a 23x difference in load time, and how do you fix the user experience on budget Android phones?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0054", "title": "The Cellular Download Wall", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a platform distribution perspective, what is the most critical, immediate problem with this approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large to fit into the device's RAM during runtime.", "It exceeds the ~200 MB cellular download limit, preventing users from installing the app without Wi-Fi.", "The 300 MB app size will take up too much of the user's total phone storage.", "Large app binaries significantly increase the time it takes for app store review and approval."], "correct_index": 1}}, {"id": "mobile-0058", "title": "The ANR Timeout", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of this event, and what is the standard time limit that is being exceeded?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Jank Event, 16 ms", "ANR Event, 5 seconds", "System Crash, 1 second", "ANR Event, 30 seconds"], "correct_index": 
1}}, {"id": "mobile-0059", "title": "The 60 FPS Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the maximum permissible latency for your model's inference to avoid causing UI 'jank', and what does 'jank' mean in this context?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 1}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["33.33ms.", "16.67ms.", "Significantly less than 16.67ms, because the model shares the frame budget with UI rendering and other logic.", "66.67ms."], "correct_index": 2}}, {"id": "mobile-0060", "title": "The Synchronous Inference ANR", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the exact user experience on an Android device when that button is tapped?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The app will feel unresponsive for 6 seconds, then continue normally.", "The app will immediately crash due to an out-of-memory error.", "The UI will freeze, and after 5 seconds, an 'Application Not Responding' (ANR) dialog will appear.", "The OS will automatically restart the app after detecting the long-running task."], "correct_index": 2}}, {"id": "mobile-0061", "title": "The App Store Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can an 8 GB smartphone safely A/B test a 1.5 GB generative model when the app already uses 400 MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it's fine. The 1.5 GB model is much smaller than the device's 8 GB of RAM.", "Yes, it should be okay. The app's total memory budget is 2 GB, and the 1.5 GB model fits within that.", "No, it's too risky. The total required memory of 1.9 GB (1.5 GB model + 0.4 GB app) leaves almost no headroom within the ~2 GB per-app budget.", "No, it's impossible. 
The 1.5 GB model alone is larger than the app's base memory of 400 MB."], "correct_index": 2}}, {"id": "mobile-0063", "title": "The Cold Start Problem", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What creates a 500ms first-inference delay when later mobile ML inferences take only 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0064", "title": "The Battery Blame Game", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If ML averages only 20 mW, what drains 1% battery per minute in the workout app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0065", "title": "The Model Warm-up on Mobile", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the first CoreML inference on iPhone 15 Pro 100-250x slower than the 8ms steady-state path?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0066", "title": "The Accessibility Conflict", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does VoiceOver read 'image' before an iPhone 15 on-device description model finishes social feed inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0067", "title": "Smart Reply Accessibility Label Race Condition", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What went wrong with the accessibility integration for the on-device ML smart replies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0069", "title": "The App Store Binary Size Limit", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a 60 MB iOS app deliver an 800 MB INT8 image-generation model without exceeding the 200 MB cellular limit?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0072", "title": "Snapdragon 8 Gen 3 Shared Resource Contention", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "Why do camera detection and LLM decoding slow down together on Snapdragon 8 Gen 3 despite separate compute units, and how do you fix it?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 0}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0073", "title": "The Display Pipeline Collision", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 60 FPS AR segmentation mask lag visibly when the iPhone display switches to 120 Hz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0074", "title": "The Concurrency Collision", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can two simultaneous NPU models each fall to 30% of solo FPS instead of sharing performance evenly (50%)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0075", "title": "The Janky Background App", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What common mobile systems issue is likely at play, and how would you try to identify the culprit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0076", "title": "The NPU Efficiency Advantage", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NPU achieve 3x better TOPS/W for the same workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0077", "title": "The Photo Segmentation Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the latency breakdown for Pixel Portrait Mode, and which stage is the bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0078", "title": "The Model Update Delta Compression", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Design a delta update system, calculate the expected patch size, and explain the unique challenge for quantized models?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0081", "title": "The Camera Preview Stutter: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the beauty filter stutter every 2-3s when inference is a stable 8ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0082", "title": "The Hardware Decoder Synchronization", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is fundamentally wrong with putting hardware decoding in the critical path of a synchronous loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0083", "title": "The Double JPEG Decode Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Where does the 140ms overhead in image picking come from, and how can it be bypassed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0084", "title": "The CoreML Model Compilation Jitter", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the first Core ML initialization freeze the UI for 3 seconds when the bundled .mlmodel starts the camera?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0085", "title": "The Android NNAPI Driver Fallback", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is NNAPI making the model 16x slower than it should be on a specific device, and how do you fix it?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0086", "title": "The Thermal Cliff", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is happening, and why can't you just 'push through it'?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0087", "title": "The Zero-Copy Imperative", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", 
"bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do memory copies affect a real-time mobile video pipeline, and what zero-copy design should replace them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0088", "title": "The Adaptive Bitrate Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What thermal-aware precision policy keeps detection under 16 ms when the NPU throttles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0089", "title": "Optimal Heterogeneous Graph Execution", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the execution strategy to minimize overall latency and power consumption on a mobile SoC with these heterogeneous compute units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0090", "title": "The Inference Timing Jitter Mystery", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the same computation take 3x longer sometimes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0091", "title": "The Cross-Platform ML Runtime Decision", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you choose between native runtimes, ONNX Runtime, and a hybrid stack for cross-platform mobile inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0092", "title": "The 1000-Device Android Fragmentation Problem", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ship one model across 1000+ Android devices that works reliably and efficiently?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0093", "title": "The Cross-Platform Model Optimization", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many optimized model binaries do you actually need, and what does the build pipeline look like?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0094", "title": "The Audio Pipeline Latency Creep", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 50 ms acoustic anomaly model alert nearly a second late when Android AudioRecord captures 1 s chunks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0095", "title": "The Battery Accounting Inversion", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does cutting NPU inference from 10 ms to 5 ms with INT8 increase feature battery drain by 15%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0096", "title": "The Adaptive Power Maestro", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a system to dynamically adapt the SoC's operating parameters to meet both the performance and power constraints?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 5}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0097", "title": "The OS Scheduler's Dilemma", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design scheduler and QoS controls for concurrent mobile vision and audio inference?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 2}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0098", "title": "The Async Camera Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect this so the user sees smooth video with accurate segmentation?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 5}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0099", "title": "The Hidden Broadcast Receiver Wake-Ups", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why do 10 SMS inferences taking 2 ms each make an Android phishing detector a top 1% battery drainer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0100", "title": "The Double FPU Context Save", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What mechanism makes these floating-point preprocessing and inference thread switches 3x slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0101", "title": "The Sustained vs Burst Reality", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did ten 12ms benchmark runs miss the two-minute thermal collapse of a 60 FPS AR filter?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 2}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0102", "title": "The Cellular Modem Power Surprise", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "You're uploading 50 bytes per second — how can that halve battery life?", "chain_ids": ["mobile-chain-bucket-powerbud-06"], "chain_positions": {"mobile-chain-bucket-powerbud-06": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0103", "title": "The Throttling Treadmill", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does object detection latency rise from 20 ms to 60 ms after 30 seconds of continuous on-device use?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0104", "title": "The Silent Battery Drain", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What common pitfalls in mobile ML background processing lead to excessive battery drain, and how can you mitigate them?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0105", "title": "The Battery Drain Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why might a GPU use more peak power but complete inference faster than a CPU, and which is better for battery life?", "chain_ids": 
["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0106", "title": "The Thermal Throttling Death Spiral", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the game halve framerate after 15 minutes even with 50% CPU/GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0107", "title": "The Accelerometer Inference Power", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can you keep total power under 5 mW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0108", "title": "The \"Warm-Up\" Performance Drop", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the primary cause for this sustained performance drop, and how is it confirmed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0109", "title": "The TDP Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 45 TOPS chip tie with a 35 TOPS chip, and what does this tell you about evaluating mobile SoCs for ML?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0110", "title": "The Thermal Throttling Trap", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 15ms Android AR filter slow to 40ms after five minutes on a mobile SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0111", "title": "The DVFS Polling Delay", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the same inference 2x faster while the user is actively scrolling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0112", "title": "The Background ML Battery Drain", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a tiny MobileNetV3 photo tagger with only 600 ms of total inference compute drain 15% of the battery overnight?", "chain_ids": 
["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0113", "title": "The Power Hungry Framework", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What factors explain the 20-30% higher power draw of TFLite on Android vs Core ML on iOS, and how can it be optimized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0114", "title": "The Pocket Furnace", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 3B INT4 assistant overheat a mobile device during a 10-minute chat, and how can it stay responsive while remaining within the thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0115", "title": "The Background ML Battery Vampire", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 0.3 ms model drain 15% of a 4355 mAh battery?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0116", "title": "The Power Domain Juggling Act", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does adding a tiny vision model to the low-power DSP inflate the system's power budget by over 100x?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0117", "title": "The Sustained Performance Cliff", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most likely cause, and how would you design your ML system to provide more sustained performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0118", "title": "The Background Thermal Throttling", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the background execution 9x slower than the foreground execution?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0119", "title": "The Background Inference Power Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many inferences can you run per background cycle, and what is the energy cost of running on NPU vs CPU?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0120", "title": "The Thermal Throttling Prediction", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a style-transfer app stay smooth when thermal throttling doubles latency after 45s?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 4}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0121", "title": "The Profiling Tool Blind Spot", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do TFLite benchmark, Systrace, and Perfetto report 8ms, 12ms, and 15ms for the same inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0122", "title": "The iOS vs Android ML Framework Maze", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is there a 2x performance gap, and how do you close it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0123", "title": "The Dynamic Shape Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does dynamic shape support cost so much, and when is padding to a fixed shape the better strategy despite wasting compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0124", "title": "CoreML RoPE Unsupported Op Conversion", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is simply removing the RoPE layer a bad idea, and how should unsupported ops be handled in CoreML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0125", "title": "The CoreML 
Conversion Black Hole", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose and fix a silently broken Core ML conversion of a custom GLU operation in a 45-layer PyTorch model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0126", "title": "The Dynamic Shape Recompilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is CoreML doing during those extra 13 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0127", "title": "The WebGPU ML Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "When is WebGPU a reasonable alternative to a native mobile ML SDK for a 50M-parameter image classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0128", "title": "The Cellular Model Download Failure", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a 180 MB first-launch model download fail for 40% of India and Southeast Asia users on 4G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0129", "title": "The Big.LITTLE Task Migration", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a CPU hand-tracking model jump from 8 ms to 30 ms after 2 minutes without thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0131", "title": "The Camera VSync Deadlock", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 12ms ML stage with 4.6ms of headroom still drop a 60Hz camera pipeline to 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0132", "title": "mmap Page Fault Contention Stalls the UI", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is a background memory read stalling the foreground UI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0136", "title": "The NPU Energy Advantage", "topic": "roofline-analysis", 
"competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which standard mobile SoC processor type is most energy-efficient for these operations?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPU, because it has the highest parallel throughput (TOPS).", "The CPU, because it avoids the latency overhead of memory transfers to another processing unit.", "The NPU, because its specialized hardware for low-precision integer math is vastly more energy-efficient.", "The NPU, but it's only slightly more efficient (e.g., 1.5-2x) than the CPU."], "correct_index": 2}}, {"id": "mobile-0138", "title": "The CPU Fallback Energy Tax", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much more energy-efficient is the INT8 version compared to the FP32 version for that specific operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x more efficient", "~2x more efficient", "~18x more efficient", "~100x more efficient"], "correct_index": 2}}, {"id": "mobile-0141", "title": "The Mobile Thermal Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the most likely physical limit the System-on-a-Chip (SoC) is encountering, and what is its approximate power value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SoC is hitting a software power limit of around 30 Watts.", "The model is likely hitting a memory bandwidth limit of a few hundred milliwatts.", "The SoC is thermal throttling, hitting its sustainable power budget of 3-5 Watts.", "The operating system is de-prioritizing the app due to a 3W memory leak over 60 seconds."], "correct_index": 2}}, {"id": "mobile-0142", "title": "The Fusion Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When a framework like Core ML or TensorFlow Lite applies operator fusion, what is the primary hardware cost it aims to reduce?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The total number of arithmetic computations (FLOPs).", "The model's storage footprint on the device's flash memory.", "The latency and power cost of writing intermediate tensors to main memory (DRAM).", "The time spent delegating unsupported operators to the CPU."], "correct_index": 2}}, {"id": "mobile-0143", "title": "The Privacy Wall", "topic": 
"monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason you must use a privacy-preserving technique like federated analytics instead of simply logging raw camera images and sending them to your servers?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The daily cellular data usage would be too expensive for users.", "The constant network requests would drain the device's battery too quickly.", "Uploading raw user data is a major privacy violation and breaks user trust.", "The on-device storage is insufficient to buffer the image logs before uploading."], "correct_index": 2}}, {"id": "mobile-0144", "title": "The Privacy Memory Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a hardware resource perspective, what is the most significant new constraint introduced by adding on-device DP to the training process?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased CPU usage from the cryptographic noise generation.", "Increased network bandwidth to send the larger, noisy model updates.", "A major increase in peak RAM to store per-example gradients.", "Increased flash storage needed to save the privacy-preserving model."], "correct_index": 2}}, {"id": "mobile-0146", "title": "The Cellular Download Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What cellular download size limit should mobile ML model delivery usually be designed around?", "chain_ids": ["mobile-chain-bucket-modelser-01"], "chain_positions": {"mobile-chain-bucket-modelser-01": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 MB", "~200 MB", "14 GB", "No limit, it depends on the user's data plan"], "correct_index": 1}}, {"id": "mobile-0147", "title": "The NPU Fallback Power Tax", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the new per-inference power draw when 20% of an NPU workload falls back to a CPU that is 10x less efficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["120 mW (Mistake: Assumes a simple linear 20% power increase)", "200 mW (Mistake: Calculates only the power for the CPU portion)", "280 mW", "1000 mW (Mistake: Applies the 10x penalty to the entire original power draw)"], "correct_index": 2}}, {"id": "mobile-0149", "title": "The NPU Fallback Memory Cost", "topic": "quantization-fundamentals", 
"competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total memory in megabytes (MB) required to store the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0 MB", "5.0 MB", "9.5 MB", "5.5 MB"], "correct_index": 2}}, {"id": "mobile-0151", "title": "The Mobile Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum duty cycle percentage the model can be active to stay within the 5W thermal budget?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["71%", "100%", "50%", "29%"], "correct_index": 2}}, {"id": "mobile-0152", "title": "The Operator Fusion Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much latency does fusing a Conv2D and ReLU save when it removes one dispatch and one memory round-trip?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1,000 ns (Ignores memory round-trip cost)", "5,000 ns (Ignores kernel dispatch overhead)", "6,000 ns (Correctly sums dispatch and memory overhead)", "0 ns (Assumes no latency benefit since FLOPs are unchanged)"], "correct_index": 2}}, {"id": "mobile-0153", "title": "Federated Averaging's Blind Spot", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How can this seemingly small global drop hide a significant problem, and what is the actual accuracy degradation for the affected 20% of users?", "chain_ids": ["mobile-chain-auto-secondary-006-25"], "chain_positions": {"mobile-chain-auto-secondary-006-25": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The accuracy for affected users dropped by 1%. The problem is being exaggerated.", "The accuracy for affected users dropped by 20%. The model is completely broken for them.", "The accuracy for affected users dropped by 5%. 
A significant degradation is being masked by the fleet average.", "It's impossible to tell the local drop without knowing the new local accuracy value directly."], "correct_index": 2}}, {"id": "mobile-0154", "title": "The Federated Learning Upload Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary communication bottleneck for the user, and what is the data payload size uploaded per training round?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 MB", "20 MB", "10 MB", "40 MB"], "correct_index": 2}}, {"id": "mobile-0155", "title": "The Battery Drain A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much extra daily energy does Model B consume if it uses 1W more than Model A for 10 minutes per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.0 W", "600 Wh", "~0.17 Wh", "~0.67 Wh"], "correct_index": 2}}, {"id": "mobile-0156", "title": "The Delta Update Dilemma", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate size of a delta patch to upgrade the model, and does it solve the OTA limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~170 MB", "~45 MB", "~175 MB", "220 MB"], "correct_index": 2}}, {"id": "mobile-0157", "title": "The 85% Delegation Fallacy", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can 15 CPU fallback operations dominate latency even when 85 of 100 operations run on the NPU?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The overall performance will be roughly 85% of the NPU's peak performance.", "The CPU is the bottleneck; the 15 unsupported ops take 45x longer than the 85 NPU ops.", "The NPU is the bottleneck because it executes 85% of the total operations.", "The CPU portion is negligible because the NPU is over 250x faster than the CPU."], "correct_index": 1}}, {"id": "mobile-0158", "title": "The App Memory Guillotine", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What safe memory budget should an Android ML feature target on a phone with 8 GB of RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 6-7 GB, since the OS only needs about 1-2 GB.", "Approximately 2 GB.", "Around 
4 GB, which is half of the total RAM.", "Unlimited, as long as the user isn't running other apps."], "correct_index": 1}}, {"id": "mobile-0159", "title": "The Unified Memory Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental hardware difference that makes deploying a 14 GB model on a 12 GB smartphone infeasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The phone's NPU is not powerful enough to run a 7B model.", "The 12 GB of RAM is unified and shared with the OS, leaving a much smaller budget for the app.", "The phone's memory bandwidth is too low to handle the model's weights.", "The 14 GB model fits perfectly in the 12 GB RAM using virtual memory swapping without penalty."], "correct_index": 1}}, {"id": "mobile-0161", "title": "The Jank Instigator", "topic": "real-time-deadlines", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Interpret this situation: what happens to the frame rate, and what is the approximate new FPS?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 1}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The game will crash because the SoC is drawing too much power.", "The frame rate will drop to 30 FPS as the system cuts performance by half.", "The frame rate will remain at 60 FPS, but the phone will get dangerously hot.", "The frame rate will drop to approximately 50 FPS as the SoC scales down performance."], "correct_index": 3}}, {"id": "mobile-0162", "title": "The A17 Pro Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the A17 Pro ridge point from 35 TOPS and 51.2 GB/s mean for mobile model optimization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. This is a unit-confusion error (forgetting the 1000x difference between Tera and Giga).", "~1.46 Ops/Byte. This is an inversion error (dividing bandwidth by compute) and misinterprets the result.", "~684 Ops/Byte. This high value means most neural network layers will be compute-bound on the A17 Pro.", "~684 Ops/Byte. 
Layers must have an arithmetic intensity greater than this to be compute-bound."], "correct_index": 3}}, {"id": "mobile-0164", "title": "The App Budget Constraint", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Using half-precision floating point (FP16), what is the memory footprint of just the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["750 MB", "1.5 GB", "3.0 GB", "12 GB"], "correct_index": 1}}, {"id": "mobile-0165", "title": "The Mobile LLM's Memory Hog", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory is needed for 1024 tokens in a 20-layer, hidden-size-512 mobile LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 MB", "40 KB", "40 MB", "80 MB"], "correct_index": 2}}, {"id": "mobile-0167", "title": "The Quantization Energy Dividend", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "About how much more energy-efficient is an INT8 MAC than an FP16 MAC for mobile inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x more efficient (linear scaling with bit-width)", "About 5x more efficient", "About 18x more efficient (assuming FP32 vs INT8 scaling)", "The savings are negligible (<10%) due to conversion overhead"], "correct_index": 1}}, {"id": "mobile-0168", "title": "The App Store Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary precision trade-off and new memory footprint when quantizing a 150M parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75 MB. Quantization from 32-bit floats provides a 4x reduction, so the 300 MB model becomes 75 MB.", "300 MB. Quantization to INT8 doesn't affect the stored weight size, only the precision of calculations during inference.", "150 MB. Each 2-byte FP16 parameter is reduced to a 1-byte INT8 parameter, halving the memory.", "1.2 GB. 
The model has 1.2 billion bits for its weights (150M x 8), which is roughly 1.2 GB."], "correct_index": 2}}, {"id": "mobile-0170", "title": "The Illusion of Symmetric Scaling", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do doubling model width and doubling input resolution each change FLOPs for a 5 GFLOP mobile vision model?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both strategies result in ~10 GFLOPs.", "Width scaling: ~20 GFLOPs; Resolution scaling: ~10 GFLOPs.", "Both strategies result in ~20 GFLOPs.", "Width scaling: ~10 GFLOPs; Resolution scaling: ~20 GFLOPs."], "correct_index": 2}}, {"id": "mobile-0171", "title": "The Mobile Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 60 FPS user experience, what is the absolute maximum latency your model inference can have before the user perceives jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms (Standard web latency target)", "~33 ms (30 FPS budget)", "~16 ms", "~1 ms (Audio processing target)"], "correct_index": 2}}, {"id": "mobile-0172", "title": "The Duty Cycle Power Trap", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a 1-hour period, which model drains more battery?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model A (the 5W model)", "Model B (the 3W model)", "They drain the same amount", "It's impossible to tell without knowing the device's battery capacity"], "correct_index": 1}}, {"id": "mobile-0174", "title": "The Voice Assistant Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Will an audio inference queue stay stable if chunks arrive every 100ms and processing takes 90ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system is unstable because at 90% utilization, there is no margin for error and the queue will eventually overflow.", "The system is stable because the processing time (90ms) is less than the arrival interval (100ms).", "The system is unstable because the workload (900ms per second) causes immediate thermal throttling.", "The system is stable but will have an infinite queue delay because 90ms leaves no time for memory IO."], "correct_index": 1}}, {"id": "mobile-0175", "title": "The 
Cost of a Glance", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate average power consumption of this feature over its 10-second cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 W", "2.5 W", "500 mW", "50 mW"], "correct_index": 2}}, {"id": "mobile-0178", "title": "The Fusion Overhead Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percent overhead does an unfused Conv-ReLU memory roundtrip add for a 1 MB tensor at 51.2 GB/s?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.7% overhead. Fusion saves writing the intermediate tensor to DRAM.", "0% overhead. The memory access time is negligible compared to compute.", "~3.5% overhead. Fusion saves a DRAM write/read roundtrip.", "~22% overhead. The memory bandwidth is the main bottleneck."], "correct_index": 2}}, {"id": "mobile-0179", "title": "The 7B Parameter Illusion", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "As the mobile ML systems engineer, what minimum FP16 memory footprint should you cite to ground the conversation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~7 GB", "~28 GB", "~14 GB", "~2 GB"], "correct_index": 2}}, {"id": "mobile-0180", "title": "The OTA Memory Budget", "topic": "compound-ai-systems", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain to the junior engineer, using napkin math, whether this update is safe to roll out from a memory perspective?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Unsafe. The new 1.0 GB model added to the 1.2 GB base usage is 2.2 GB, which exceeds the 2.0 GB budget.", "Unsafe. A 1B parameter model requires 2.0 GB of memory (using 2 bytes/param), which is the entire budget.", "Safe. The memory increase is only 0.5 GB, bringing the new peak to 1.7 GB, which is under the 2.0 GB budget.", "Safe. 
The OS will use memory mapping, so the model's size on disk doesn't count against the app's RAM budget."], "correct_index": 2}}, {"id": "mobile-0181", "title": "The Frozen UI Watchdog", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is a standard, low-level timer mechanism used to proactively detect this frozen state and trigger a faster, more graceful recovery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A software watchdog timer", "Relying on the OS 'Application Not Responding' (ANR) timeout", "A network-based health check to a remote server", "Wrapping the inference call in a try/except block"], "correct_index": 0}}, {"id": "mobile-0184", "title": "The Camera Pre-processing Skew", "topic": "mlops-lifecycle", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the primary risk of this pipeline and calculate the approximate data reduction factor between a single training frame and a single serving frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A ~14x reduction. This is a typical and acceptable trade-off for mobile performance.", "A ~330x reduction. This skew is caused by forgetting to account for the 3 color channels in the serving image.", "A ~110x reduction. This causes training-serving skew because the on-device preprocessing artifacts are not in the training data.", "A ~880x reduction. This level of compression is too high and indicates a miscalculation in bit-to-byte conversion."], "correct_index": 2}}, {"id": "mobile-0185", "title": "The Two-Billion Dollar Keystroke", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why can Federated Learning be economically justified for keyboard prediction despite higher direct implementation costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The $10M/year cost increase is not justifiable as it triples the project's budget.", "FL saves significant network costs by not having to upload petabytes of user data.", "The potential cost of a data breach far exceeds the implementation cost difference.", "FL provides better model accuracy through on-device personalization."], "correct_index": 2}}, {"id": "mobile-0187", "title": "The Mobile Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a layer doing 50 GOps while reading 500 MB memory-bound on an A17 Pro, and what roofline math proves it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The ANE's 35 TOPS is the limiting factor.", "Memory-bound. 
The layer's arithmetic intensity is ~100 Ops/Byte, which is significantly lower than the A17 Pro's ridge point of ~683 Ops/Byte.", "Compute-bound. The layer's arithmetic intensity is ~100 Ops/Byte, which is high enough to saturate the processor.", "Neither. The workload is balanced because the amount of data and compute are both large."], "correct_index": 1}}, {"id": "mobile-0188", "title": "The Mobile Memory Chasm", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much slower is a random read from the phone's UFS flash storage compared to its main LPDDR5 DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash is ~1,000× slower than DRAM.", "Flash is ~10× slower than DRAM.", "They have similar latency, within 2-3× of each other.", "Flash is actually faster for sequential burst reads, making it superior for models."], "correct_index": 0}}, {"id": "mobile-0189", "title": "The Mobile KV-Cache Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory is required for a 24-layer, 16-head LLM with 4096 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~192 MB", "~384 MB", "~768 MB", "~96 MB"], "correct_index": 1}}, {"id": "mobile-0190", "title": "The TinyML vs Mobile Memory Arena", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can a 250 KB keyword model that fits a Cortex-M7 tensor arena still face memory constraints on a 12 GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile app can use all 12GB of RAM, making it over 48,000x larger. (Trap: Assuming no OS caps)", "The mobile app's budget is orders of magnitude larger (~12,000x), but it's higher-latency DRAM managed by an OS, unlike the microcontroller's dedicated high-speed SRAM.", "The memory is the same type (RAM), so the only difference is the amount available. (Trap: Missing SRAM vs DRAM difference)", "The mobile OS overhead consumes most of the RAM, so the actual available memory is similar to the TinyML device. 
(Trap: Numerically false)"], "correct_index": 1}}, {"id": "mobile-0191", "title": "The Quantization Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the final memory requirement for a 7B parameter model's weights after quantizing from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "3.5 GB", "7 GB", "700 MB"], "correct_index": 2}}, {"id": "mobile-0195", "title": "The SoC Power Draw", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When the model is actively processing audio, what is a realistic power consumption value for the phone's System-on-a-Chip (SoC)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30 W", "~4 W", "~15 W", "~1.5 W"], "correct_index": 1}}, {"id": "mobile-0197", "title": "The Background Battery Drainer", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming negligible power consumption during sleep, what percentage of a 19Wh phone battery does a 0.5-second, 3W task every 10 seconds use in 24 hours?", "chain_ids": ["mobile-chain-bucket-powerbud-02"], "chain_positions": {"mobile-chain-bucket-powerbud-02": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-02": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Over 100%. The feature consumes 72Wh, which isn't possible.", "Around 3.2%. The feature is active for about 0.2 hours per day.", "Around 18.9%. The feature is active for 1.2 hours and consumes 3.6Wh.", "Around 6.3%. The duty cycle is 1/60, leading to 1.2Wh of consumption."], "correct_index": 2}}, {"id": "mobile-0198", "title": "The AR Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What duty cycle can a 5W AR feature sustain when idle power is 1W and passive cooling dissipates 3W?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["60%. The phone can dissipate 3W of the 5W active power.", "100%. The SoC can handle 5W without issue.", "40%. The excess heat generated is 2W, which is 40% of the active power.", "50%. 
The weighted average of active and idle power must equal the 3W dissipation rate."], "correct_index": 3}}, {"id": "mobile-0199", "title": "The OS Kill Switch: Mobile Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum total app memory budget should you target on an 8 GB smartphone to avoid OS termination?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB", "8 GB", "2 GB", "500 MB"], "correct_index": 2}}, {"id": "mobile-0200", "title": "The On-Device RAG Budget", "topic": "compound-ai-systems", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory do the FP16 weights of a 1B parameter model require, and does it fit within a standard mobile app budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It needs 1 GB, so it fits easily.", "It needs 2 GB, which is much less than the 8 GB device RAM, so it fits easily.", "It needs 2 GB, which consumes the entire 25% app budget, making it too risky.", "It needs 4 GB, so it won't fit."], "correct_index": 2}}, {"id": "mobile-0201", "title": "The Watchdog's Power Tax: Monitoring & Observability", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy, in Watt-hours (Wh), consumed only by the watchdog process over a 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12 Wh", "0.72 Wh", "0.12 Wh", "432 Wh"], "correct_index": 2}}, {"id": "mobile-0202", "title": "The Hidden Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the single largest operational cost introduced by choosing the Federated Learning design at scale?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 0}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased on-device compute and battery drain.", "Server compute cost for aggregating model updates.", "Network communication overhead from gradient uploads.", "Storage costs for the global model on the server."], "correct_index": 2}}, {"id": "mobile-0203", "title": "The Economics of Privacy: Centralized vs. 
Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What monthly cloud ingress cost results from 1M users uploading 5 MB of training data per day at $0.01 per GB?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 1}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$50", "$500", "$1,500", "$15,000"], "correct_index": 2}}, {"id": "mobile-0204", "title": "The Mobile Roofline Limit", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a layer with 100 Ops/Byte memory-bound or compute-bound on an NPU with 35 TOPS and 51.2 GB/s bandwidth?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Ops/Byte; Compute-bound", "683 Ops/Byte; Compute-bound", "100 Ops/Byte; Memory-bound", "683 Ops/Byte; Memory-bound"], "correct_index": 2}}, {"id": "mobile-0205", "title": "The Mobile KV-Cache Memory Trap", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory does a 32-layer, 4096-hidden LLM need for a 4096-token context on mobile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1 GB", "4 GB", "2 GB", "256 MB"], "correct_index": 2}}, {"id": "mobile-0206", "title": "The On-Device Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Explain whether the model's weights will fit into this budget using FP16 precision, and contrast this with using INT8 precision?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both will fit easily within the 8 GB of device RAM.", "Neither will fit; the model is too large for on-device deployment.", "It will exceed the budget in FP16, but fit using INT8.", "Both require the same 1.5 GB of memory, so both will fit."], "correct_index": 2}}, {"id": "mobile-0207", "title": "The MobileNet Parameter Diet", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much does a 3x3 depthwise separable convolution reduce parameters for 128 input and 256 output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No change in parameters, only latency improves", "~2x reduction", "~256x reduction", "~9x reduction"], "correct_index": 3}}, {"id": "mobile-0208", "title": "The Two Latencies of Generative AI", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", 
"status": "published", "phase": "inference", "question": "What standard industry terms define these two critical performance metrics in generative models?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 0}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Initial Latency and Inference Speed", "Cold Start and Warm Read", "P50 Latency and P99 Latency", "Time to First Token (TTFT) and Time Per Output Token (TPOT)"], "correct_index": 3}}, {"id": "mobile-0212", "title": "The Watchdog's Battery Bill", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What is the total energy cost in Watt-hours (Wh) for watchdog recovery events over an 8-hour drive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5.33 Wh", "0.0028 Wh", "0.089 Wh", "5.0 W"], "correct_index": 2}}, {"id": "mobile-0213", "title": "The Federated Learning Cost Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a single training round, which of the following operations is the primary driver of the user's cost in terms of battery consumption and data-plan usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Local model training computation on the NPU (~1 Joule).", "Reading the local training data from UFS flash storage (~0.1 Joule).", "Transmitting the computed model update over the cellular network (~10 Joules).", "The overhead from the A/B testing framework that selected the user (~0.01 Joule)."], "correct_index": 2}}, {"id": "mobile-0215", "title": "The Mobile VFX Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of this layer, and is it compute-bound or memory-bound on an A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; 1.4 Billion operations is a heavy workload that will saturate the ANE's compute units.", "Compute-bound; the AI is about 200 Ops/Byte, which exceeds the A17 Pro's ridge point.", "Memory-bound; the AI is 200 Ops/Byte, which is less than the A17 Pro's ridge point of ~683 Ops/Byte.", "Compute-bound; the AI is 200 Ops/Byte, and any AI over 100 is typically considered high enough to be compute-bound."], "correct_index": 2}}, {"id": "mobile-0216", "title": "The Mobile Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the memory requirements and calculate the model's final storage size in megabytes after full INT8 quantization?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7.5 MB", "30 MB", "15 MB", "60 MB"], "correct_index": 2}}, {"id": "mobile-0217", "title": "The Depthwise Separable Dividend: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much computational reduction should replacing a standard 3x3 convolution with a depthwise separable convolution provide?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It provides no computational reduction, it only saves memory.", "A ~3x reduction.", "A ~9x reduction.", "A ~2x reduction."], "correct_index": 2}}, {"id": "mobile-0219", "title": "The App Memory Diet", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a reasonable rule-of-thumb memory budget for your entire feature, including the model and runtime activations, to avoid being terminated by the OS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB (Fails to account for OS footprint)", "8 GB (Total theoretical RAM)", "256 MB (Unnecessarily restrictive)", "2 GB"], "correct_index": 3}}, {"id": "mobile-0220", "title": "The 16ms UI Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To avoid UI jank, what is the maximum number of tokens you can generate synchronously before you must implement a background-threaded generation strategy?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 1}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["32 tokens", "60 tokens", "8 tokens", "16 tokens"], "correct_index": 2}}, {"id": "mobile-0221", "title": "The 'Ambient Assistant' Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percentage of the total battery capacity will this feature consume over a 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~32%", "~39%", "~164%", "~320%"], "correct_index": 1}}, {"id": "mobile-0222", "title": "The Runaway Inference Battery Drain", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would you calculate the approximate battery energy consumed by a single runaway inference that lasts for 60 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["300 Wh. (Calculated Joules but used Wh unit)", "5 Wh. 
(Used Watt rating directly as Wh)", "≈ 0.083 Wh. This is a significant battery drain for a single failed operation", "8.3 Wh. (Forgot to divide by full 3600)"], "correct_index": 2}}, {"id": "mobile-0224", "title": "The Battery Cost of Federated Personalization", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the total energy consumed by this 15-minute daily federated learning feature per user over a 30-day period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.75 Wh", "3.0 W", "22.5 Wh", "2160 Wh"], "correct_index": 2}}, {"id": "mobile-0226", "title": "The App Store Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If you quantize the model's weights to INT8, what is the new size of the model on disk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 MB", "32 MB", "4 MB", "2 MB"], "correct_index": 2}}, {"id": "mobile-0227", "title": "The Mobile Roofline Riddle", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of the style transfer model, and is it memory-bound on this NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 Ops/Byte; Memory-bound, as its AI is far below the device's ridge point.", "140 Ops/Byte; Compute-bound, as this is a high operational intensity.", "684 Ops/Byte; Compute-bound, as the model's intensity will match the device's.", "17.5 Ops/Byte; Memory-bound, from incorrectly converting bytes to bits."], "correct_index": 0}}, {"id": "mobile-0228", "title": "The Mobile Memory Squeeze", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does quantizing the 750M parameter FP16 model to INT8 allow it to fit within the 1 GB memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["FP16 is 750 MB, INT8 is 375 MB. Both fit.", "FP16 is 3.0 GB, INT8 is 1.5 GB. Neither fits.", "FP16 is 1.5 GB, INT8 is 750 MB. The INT8 model fits.", "FP16 is 1.5 GB, INT8 is 1.5 GB. There are no memory savings."], "correct_index": 2}}, {"id": "mobile-0231", "title": "NPU vs. 
Reality", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Given the A17 Pro's peak performance of 35 TOPS, what is the theoretical maximum frame rate (FPS) you could achieve, ignoring all memory, OS, and framework overhead?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20 FPS", "5 FPS", "500 FPS", "50 FPS"], "correct_index": 3}}, {"id": "mobile-0232", "title": "The Mobile Generative UI Latency Trap", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For time-to-last-result, is it faster to process 3 sequential 150ms edits or one 300ms batch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450ms (no-batch) vs. 450ms (batch); batching is always better.", "150ms (no-batch) vs. 300ms (batch); no-batching is better for throughput.", "450ms (no-batch) vs. 300ms (batch); batching is faster for time-to-last-result.", "450ms (no-batch) vs. 225ms (batch); batching time scales linearly."], "correct_index": 2}}, {"id": "mobile-0233", "title": "The Drowsy Driver's Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy in Watt-hours (Wh) this feature will consume from the phone's battery over the course of an 8-hour road trip?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 Wh", "0.2 Wh", "1.6 Wh", "1.8 Wh"], "correct_index": 2}}, {"id": "mobile-0234", "title": "The 25% Mobile Memory Rule", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before checking the model's specific memory footprint, what is the first-order, rule-of-thumb application memory budget you should recall for a typical high-end smartphone with 8GB of RAM?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 0}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8 GB", "4 GB", "2 GB", "256 MB"], "correct_index": 2}}, {"id": "mobile-0236", "title": "The Million-Car Update", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When evaluating the Total Cost of Ownership (TCO) for this single experiment, which of the following costs should you identify as the most significant and immediate financial factor?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": "", "options": ["The $5,000 cloud compute cost to retrain the model.", "The $50,000 cellular data cost to download the 50MB model update.", "The $500,000 fleet-wide cellular data cost to download the 50MB model update.", "The $5,000,000 increased battery consumption from running a larger model."], "correct_index": 2}}, {"id": "mobile-0237", "title": "The Fleet-Level Cost of an 'Always-On' Mobile Feature", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the annual energy cost across a fleet of 1 million cars if the feature is used 1.5 hours per day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$186.15", "$186,150,000", "$186,150", "$2,978,400"], "correct_index": 2}}, {"id": "mobile-0239", "title": "The INT8 Memory Diet", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the memory savings in megabytes (MB) for the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450 MB", "300 MB", "150 MB", "75 MB"], "correct_index": 2}}, {"id": "mobile-0240", "title": "The Depthwise Efficiency Factor", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you'd expect to see in those layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2-3x (Trap: Underestimating quadratic scaling)", "Roughly 8-9x", "Roughly 4-5x (Trap: Miscalculating K squared)", "It's the same, but memory is reduced (Trap: Confusing params with FLOPs)"], "correct_index": 1}}, {"id": "mobile-0241", "title": "The First-Word Latency Test", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which metric is most critical to minimize for this initial perception of responsiveness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Time To First Token (TTFT)", "Model Loading Time", "Total Generation Time for the sequence"], "correct_index": 1}}, {"id": "mobile-0244", "title": "The 7 Billion Parameter Car Crash", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware specs for mobile-class systems, what is the most immediate, fundamental blocker for this plan?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU's TOPS limit will cap inference at 1 token/sec.", "The OTA download exceeds cellular bandwidth SLAs.", "The 7 GB INT8 footprint physically exceeds the system's available RAM.", "The context window will max out the KV cache in 2 turns."], "correct_index": 2}}, {"id": "mobile-0245", "title": "The OTA Storage Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak storage is required during a rollback-safe OTA update from a 500 MB model to a 1.8x larger v2 model?", "chain_ids": ["mobile-chain-auto-001-05"], "chain_positions": {"mobile-chain-auto-001-05": 1}, "chain_tiers": {"mobile-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1600 MB. The final size is well within the limit.", "1700 MB. The peak is the initial size plus the old model being duplicated.", "2400 MB. The update will temporarily violate the 2 GB storage limit.", "1950 MB. The update is large but stays just within the limit."], "correct_index": 2}}, {"id": "mobile-0246", "title": "The Battery Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate power consumption of a modern smartphone's System-on-a-Chip (SoC) when it's actively running an ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 milliwatts", "~30 Watts", "~3–5 Watts", "~700 Watts"], "correct_index": 2}}, {"id": "mobile-0247", "title": "The Privacy vs. 
Battery-Life Tax", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which uses less energy for mobile A/B testing: 60s of 3W federated learning or a 3s, 5W cellular upload?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Cloud approach is much costlier (~1500 Joules), as uploading data is always expensive.", "B) Both approaches consume roughly the same amount of energy (~150-180 Joules).", "C) The FL approach is much costlier (~180 Joules vs ~15 Joules for the cloud).", "D) The energy cost cannot be compared, as one is compute (Watts) and the other is data (MB)."], "correct_index": 2}}, {"id": "mobile-0248", "title": "The A17 NPU Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is an A17 Pro layer with 50 GOps and 200 MB of memory traffic memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the required 50 Giga-ops exceeds the capacity of the NPU.", "Memory-bound, because the memory bandwidth number (51.2) is much smaller than the compute number (35,000).", "Memory-bound, because its arithmetic intensity (250 Ops/Byte) is below the A17's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity (250 Ops/Byte) is high, indicating a heavy compute load."], "correct_index": 2}}, {"id": "mobile-0249", "title": "The On-Device Memory Diet: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the memory requirements for both precisions and calculate the total memory savings in gigabytes?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 0}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 GB", "3.5 GB", "7 GB", "21 GB"], "correct_index": 2}}, {"id": "mobile-0250", "title": "The MobileNet Multiplier: Transformer Systems Cost", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary benefit you recall this change provides?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly improves model accuracy.", "It provides a >100x reduction in computation.", "It reduces memory by 4x by using INT8 precision.", "It provides a roughly 9x reduction in computation and parameters."], "correct_index": 3}}, {"id": "mobile-0252", "title": "The Real-Time 
Translation Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the model's token generation latency to the UI frame budget to determine the source of the jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 tokens/sec is high throughput; the model isn't the bottleneck.", "The 150ms TTFT causes a 9-frame initial drop; subsequent tokens are fast enough.", "At ~33.3ms per token, synchronous inference blocks the main UI thread for over two 16ms frames.", "The UI thread is blocked for 16ms, creating a 50% inference timeout."], "correct_index": 2}}, {"id": "mobile-0253", "title": "The Background Battery Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can you calculate the total energy this feature consumes from the battery over one hour of continuous operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 Wh", "0.1 W", "0.1 Wh", "10.0 Wh"], "correct_index": 2}}, {"id": "mobile-0254", "title": "The OTA Update Space Crunch", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much free space remains during an atomic OTA update when the framework, v1 model, compressed v2, and uncompressed v2 coexist?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["648 MB", "150 MB", "198 MB", "798 MB"], "correct_index": 2}}, {"id": "mobile-0255", "title": "The Mobile Power Chasm", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate active-inference-to-deep-sleep power ratio for a mobile SoC?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 0}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100x", "1,000x", "500,000x", "10x"], "correct_index": 2}}, {"id": "mobile-0256", "title": "The Federated Learning Battery Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much annual energy does one 5-minute, 3W federated learning round per day consume on a user's phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["15 Wh. (Trap: 3W * 5 min)", "1,095 Wh. (Trap: 3W * 365 without hour conversion)", "91.25 Wh.", "2190 Wh. 
(Trap: Wrong unit base entirely)"], "correct_index": 2}}, {"id": "mobile-0257", "title": "The Mobile NPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 2 GOps, 10 MB object detection inference memory-bound on an A17 Pro, and what throughput does that imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity is over 200,000,000 Ops/Byte, which easily saturates the ANE.", "Memory-bound, because its Arithmetic Intensity of 200 Ops/Byte is less than the A17's ridge point of ~683 Ops/Byte.", "Compute-bound, because 35 TOPS is a massive amount of performance, and the model is relatively small.", "It's impossible to tell without knowing the power efficiency in TOPS/W for this specific model."], "correct_index": 1}}, {"id": "mobile-0258", "title": "The Depthwise Separable Efficiency Gain", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you should expect from this change?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2-3× reduction (assuming standard grouped convolution scaling)", "No significant reduction (assuming memory bounds negate compute savings)", "~8-9× reduction", "~27× reduction (incorrectly cubing the 3x3 kernel dimension)"], "correct_index": 2}}, {"id": "mobile-0259", "title": "The 'Instant Reply' Metric", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the user to perceive the suggestion as 'instantaneous', what key metric must be minimized, and what is its approximate target value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT), targeting ~50ms", "Frame Rendering Latency, targeting ~16ms", "Time To First Token (TTFT), targeting ~100ms", "Throughput, targeting >20 tokens/second"], "correct_index": 2}}, {"id": "mobile-0260", "title": "The On-Device Assistant's First Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 7B INT8 assistant load its 7 GB weights over 77 GB/s LPDDR5X within a 200 ms TTFT budget?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 1}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~182 ms. Yes, but it's very close to the budget limit.", "~727 ms. No, this is far too slow and will feel laggy.", "~91 ms. 
Yes, this is well within the 200ms budget.", "~11 ms. Yes, it's extremely fast, leaving plenty of budget."], "correct_index": 2}}, {"id": "mobile-0264", "title": "The Mobile Style Transfer Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is a mobile style transfer CNN with arithmetic intensity below the A17 Pro ridge point memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound; the model's Arithmetic Intensity is ~288 Ops/Byte, and since this is a high number, the compute units must be the bottleneck.", "Memory-bound; the model's Arithmetic Intensity is 0.2 Ops/Byte, which is far too low.", "Memory-bound; the model's Arithmetic Intensity is ~288 Ops/Byte, which is below the A17 Pro's hardware ridge point of ~683 Ops/Byte.", "Compute-bound; the model requires 0.005 Bytes/Op, meaning very little data is needed per operation, so the bottleneck must be compute speed."], "correct_index": 2}}, {"id": "mobile-0265", "title": "The INT8 Memory Halving", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the final memory footprint when quantizing the 50 million parameter model from FP16 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25 MB, a 4x reduction.", "12.5 MB, an 8x reduction.", "50 MB, a 2x reduction.", "100 MB, no change."], "correct_index": 2}}, {"id": "mobile-0266", "title": "The Activation Memory Footprint", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Assuming the model processes one frame at a time (batch size 1) and uses half-precision floating point (FP16), what is the memory required to store this single activation map?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 KB", "~512 KB", "~1 MB", "~10 MB"], "correct_index": 2}}, {"id": "mobile-0267", "title": "The Voice Assistant's First Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the user perceives the assistant as immediately responsive after they ask a question, which of the following latency metrics is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Time Per Output Token (TPOT)", "Total generation time for the full response", "Time to First Token (TTFT)", "NPU delegation ratio"], "correct_index": 2}}, {"id": "mobile-0269", "title": "The Dashcam Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the average power consumption and 
estimate how long the battery will last running this feature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4.6 hours", "~9.2 hours", "~18.4 hours", "~23.1 hours"], "correct_index": 2}}, {"id": "mobile-0271", "title": "The Federated Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why can Federated Learning consume more total system energy than centralized training at fleet scale?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized, because a 700W datacenter GPU uses far more power than a 5W phone.", "They are roughly equal; the powerful server's work balances out the distributed work of many weak devices.", "Federated Learning, because the aggregate energy consumed by millions of phones exceeds the server's energy budget.", "Centralized, because transmitting raw data from millions of devices to the server consumes the most energy."], "correct_index": 2}}, {"id": "mobile-0274", "title": "The Depthwise Separable Advantage", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of making this change?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly increases model accuracy by adding more layers.", "It only reduces model size (parameters), but compute cost (FLOPs) remains the same.", "It significantly reduces both computation (FLOPs) and parameters (model size).", "It enables models to be quantized to INT8, which is not possible with standard convolutions."], "correct_index": 2}}, {"id": "mobile-0276", "title": "The Jank Budget Fallacy", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can either the single-token or batched approach stream tokens to the UI without causing jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The batched approach is better because the throughput is higher.", "Both approaches are effectively the same since the average time per token is 30ms.", "Neither approach works; both violate the 16ms deadline, and batching increases perceived latency.", "The single-token approach works; 30ms is fast enough for a mobile device."], "correct_index": 2}}, {"id": "mobile-0279", "title": "The Federated Learning Litmus Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason to use Federated Learning for driver-drowsiness video data instead of central collection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To reduce network bandwidth costs from uploading terabytes 
of video data.", "To achieve higher final model accuracy compared to centralized training.", "To protect user privacy by not collecting or centralizing raw video data.", "To simplify the A/B testing process for new model architectures across the fleet."], "correct_index": 2}}, {"id": "mobile-0280", "title": "The Economics of On-Device Learning: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the daily data transfer costs for centralized versus federated training across 1M mobile users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized costs $100,000/day; Federated costs $400/day.", "Centralized costs $100/day; Federated costs $4/day.", "Centralized costs $100,000/day; Federated costs $4,000/day.", "Centralized costs $102,400/day; Federated costs $4,096/day."], "correct_index": 2}}, {"id": "mobile-0284", "title": "The Mobile UI Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To maintain a smooth 60 FPS user experience and avoid UI 'jank', what is the approximate latency budget for a single inference call that needs to run on the main thread before the next frame is drawn?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 0}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1 ms", "~16 ms", "~33 ms", "~100 ms"], "correct_index": 1}}, {"id": "mobile-0285", "title": "The Real-Time Dashcam Dilemma", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many objects can your system theoretically detect in a single frame before dropping below the real-time deadline, assuming 2 Giga-Ops per object?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["17 objects", "280,000 objects", "280 objects", "1 object"], "correct_index": 2}}, {"id": "mobile-0287", "title": "The Autopilot OTA Budget", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 60M-parameter FP16 model fit within a strict 150 MB cellular OTA payload limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the payload requires 240 MB (FP32), which exceeds the 150 MB limit.", "Yes, the payload requires 60 MB (INT8), well under the limit.", "Yes, the payload requires roughly 130 MB (FP16 + metadata), fitting the limit.", "No, parameter count alone cannot determine the binary payload size."], "correct_index": 2}}, {"id": "mobile-0288", "title": "The Drowsy Driver's Dilemma", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": 
"recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which of the following factors is the MOST critical lever for managing the total battery energy consumed by this feature over the entire shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak power draw of the inference (Pinference)", "The amount of RAM the model's activations consume", "The duty cycle of the inference process", "The speed of the phone's flash storage (UFS 4.0)"], "correct_index": 2}}, {"id": "mobile-0289", "title": "The Battery Drain Tax", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Compare the daily battery energy cost of Model A to Model B?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B consumes about 0.0042 Wh per day, roughly 1.5 times more than Model A's 0.0028 Wh.", "Model B consumes 3 Wh per day, while Model A consumes 0.5 Wh. It's a 6x difference.", "Model B consumes 1 Wh per day (60 Ws / 60), which is significantly more than Model A.", "Model B consumes 600 Wh per day, making it completely infeasible for a mobile device."], "correct_index": 0}}, {"id": "mobile-0290", "title": "The Mobile Roofline Dilemma: Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the model's Arithmetic Intensity and determine if the workload is compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["140 OPS/Byte; Compute-bound", "683 OPS/Byte; Compute-bound", "140 OPS/Byte; Memory-bound", "683 OPS/Byte; Memory-bound"], "correct_index": 2}}, {"id": "mobile-0291", "title": "The INT8 Memory Payoff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the impact on the model's static memory footprint and calculate the new size in megabytes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["14 MB", "7 MB", "3.5 MB", "28 MB"], "correct_index": 1}}, {"id": "mobile-0292", "title": "The Cost of Depthwise Separable Convolutions", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a typical layer using a 3x3 kernel, approximately how much computationally cheaper is it to replace a standard convolution with a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x cheaper", "Roughly the same cost, it 
just saves memory", "About 9x cheaper", "About 128x cheaper (scales with channels)"], "correct_index": 2}}, {"id": "mobile-0294", "title": "The Real-Time Batching Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is batching real-time dashcam frames counter-productive when camera frames arrive every 33.3ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It fails. The latency for the first frame is 73.3ms because it waits for the second frame.", "It works. The average latency is 20ms (40ms/2), well under the 33.3ms deadline.", "It fails. The 40ms batch processing time exceeds the 33.3ms deadline.", "It works. Two frames give 66.6ms budget, and 40ms processing fits."], "correct_index": 0}}, {"id": "mobile-0297", "title": "The Energy Cost of Precision", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much less energy does a single INT8 arithmetic operation consume compared to a single FP32 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x", "~4x", "~18x", "~100x"], "correct_index": 2}}, {"id": "mobile-0299", "title": "The A17 Pro's Arithmetic Intensity", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this specific layer compute-bound or memory-bound based on its arithmetic intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity of 350 Ops/Byte is much greater than the A17's ridge point of ~0.68 Ops/Byte.", "Compute-bound, because the workload's Arithmetic Intensity (350 Ops/Byte) is less than the hardware's ridge point (~684 Ops/Byte).", "Memory-bound, because the workload's Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's ridge point (~684 Ops/Byte).", "Memory-bound, because 2 GB/s is a large amount of data, which always indicates a memory bottleneck on mobile."], "correct_index": 2}}, {"id": "mobile-0300", "title": "The App Size Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the model's memory footprint in its original FP16 format, and what is its footprint after full INT8 quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600 MB (FP16) vs. 150 MB (INT8)", "300 MB (FP16) vs. 150 MB (INT8)", "2.4 GB (FP16) vs. 1.2 GB (INT8)", "150 MB (FP16) vs. 
75 MB (INT8)"], "correct_index": 1}}, {"id": "mobile-0303", "title": "The Live Caption Queueing Delay", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average queue length results when 30 FPS frames take 25ms each to process on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.0 frames", "0.75 frames", "0 frames, because service time is less than the arrival interval", "2.25 frames"], "correct_index": 3}}, {"id": "mobile-0304", "title": "The 'Always-On' Battery Drain Dilemma", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much energy does an always-on keyword feature consume over 24 hours with 100ms active at 3W each second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["72 Wh (Confuses peak power with average power)", "7.2 Wh (Ignores the idle power consumption)", "~8.3 Wh", "~29.8 Wh (Incorrectly converts Joules to Watt-hours)"], "correct_index": 2}}, {"id": "mobile-0305", "title": "The OTA Budget Bust", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory will the model parameters alone require for inference, assuming it's stored in FP16 precision?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["7 GB. (Trap: Assuming INT8 weights at 1 byte/param)", "112 GB. (Trap: Assuming 16 bytes/param for full training states)", "14 GB.", "28 GB. 
(Trap: Assuming FP32 inference at 4 bytes/param)"], "correct_index": 2}}, {"id": "mobile-0306", "title": "The Energy Cost of Data Movement", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much more energy does a DRAM access cost than one FP16 operation on a mobile ML accelerator?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1x (Trap: assuming compute and data fetch are roughly equivalent).", "~10x (Trap: assuming standard software memory hierarchy costs).", "~100x (Trap: underestimating the physical trace distance to LPDDR).", "~580x."], "correct_index": 3}}, {"id": "mobile-0307", "title": "The On-Device Battery Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What percentage of a full battery charge is consumed by a single FL training round?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 1}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0%", "5.0%", "2.5%", "1.0%"], "correct_index": 1}}, {"id": "mobile-0309", "title": "The 25% Rule for Mobile Memory", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What ML feature memory budget follows from the 25% rule on an 8 GB flagship smartphone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["6 GB", "4 GB", "2 GB", "512 MB"], "correct_index": 2}}, {"id": "mobile-0310", "title": "The Mobile Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 'buttery' 60 FPS user experience, what is the approximate latency budget for your entire ML inference pipeline per frame before a user will perceive 'jank'?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 0}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100ms (Typical cloud service P99 latency)", "~33ms (Standard 30 FPS video deadline)", "~16ms (60 FPS 'jank' deadline)", "~1ms (TinyML interrupt latency)"], "correct_index": 2}}, {"id": "mobile-0311", "title": "The On-Device Jank Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What token throughput results when a mobile generative model's 25ms TPOT is slower than the UI frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~63 tokens/sec.
The throughput is determined by the 16.67ms UI refresh deadline.", "~24 tokens/sec. This is derived from the sum of the latencies (16.67ms + 25ms).", "40 tokens/sec. The throughput is bottlenecked by the 25ms token generation time.", "~111 tokens/sec. This is derived from the difference in latencies (25ms - 16.67ms)."], "correct_index": 2}}, {"id": "mobile-0314", "title": "The Core Premise of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason you'd state for using Federated Learning in this mobile context?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It produces more accurate models by specializing on each user's data.", "It keeps raw user data on the device, enhancing user privacy.", "It reduces network bandwidth costs by sending small gradient updates instead of large datasets.", "It allows for faster overall model training compared to centralized methods."], "correct_index": 1}}, {"id": "mobile-0315", "title": "The Federated vs. Centralized Data Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What daily centralized upload cost results from 1M users sending 10 MB each at $0.09 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$900,000 per day", "$90,000 per day", "$900 per day", "$9 per day"], "correct_index": 2}}, {"id": "mobile-0316", "title": "The AR Filter's Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 70 GOPS, 200 MB convolution layer on the Neural Engine compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because its Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's Ridge Point (~683 Ops/Byte).", "Memory-bound, because its Arithmetic Intensity is ~0.0028 Bytes/Op, which is very low.", "Compute-bound, because 35 TOPS is a very high compute capability, so the layer will always be limited by the accelerator's speed.", "Memory-bound, because its Arithmetic Intensity (350 Ops/Byte) is less than the A17 Pro's Ridge Point (~683 Ops/Byte)."], "correct_index": 3}}, {"id": "mobile-0317", "title": "The App Memory Budget: Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum ML feature memory budget is reasonable on a smartphone with 8 GB of RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["8.0 GB (100% Allocation)", "4.0 GB (50% Allocation)", "2.0 GB (25% Allocation)", "0.5 GB (6% Allocation)"], "correct_index": 3}}, {"id": "mobile-0319", "title": "The On-Device Reader's Bottleneck", "topic": "memory-hierarchy-design", 
"competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum TPOT is imposed by reading 1 GB of INT8 LLM weights over 77 GB/s mobile memory bandwidth?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 0}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~104 ms", "~0.04 ms", "~13 ms", "~1.3 ms"], "correct_index": 2}}, {"id": "mobile-0320", "title": "The Dashcam's Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the average power consumption of a dashcam pipeline that runs at 5W for 1s and sleeps at 100mW for 9s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0.50 W", "2.55 W", "0.59 W", "4.51 W"], "correct_index": 2}}, {"id": "mobile-0321", "title": "The Cellular Data Bill Shock", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming these updates happen over cellular networks, how much data must be delivered for a 10% rollout of a 75 MB model to 1M active users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["750 GB", "7.5 TB", "75 TB", "7.5 GB"], "correct_index": 1}}, {"id": "mobile-0322", "title": "The Mobile Memory Budget", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To avoid having your app terminated by the operating system, what is the generally accepted maximum memory budget for a single application relative to the total device RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75% — The OS allocates most of the memory to the foreground app.", "50% — The memory is split evenly between the app and the OS.", "25% — A conservative budget to ensure stability alongside the OS and other processes.", "5% — The OS is highly restrictive, leaving very little for any single app."], "correct_index": 2}}, {"id": "mobile-0323", "title": "The Federated Learning Subsidy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do centralized and federated personalization data subsidies compare for 1M keyboard users at $10 per GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The costs are identical ($100,000/day), so the choice depends on privacy, not economics.", "Centralized costs $10,000/day and Federated costs $1,000/day; Centralized is only 10x more expensive.", "Centralized costs $100,000/day and Federated costs 
$1,000/day; Federated is 100x cheaper.", "The costs are negligible for both, totaling less than $100 per day."], "correct_index": 2}}, {"id": "mobile-0326", "title": "The UI Jank Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth, 'buttery' user experience and avoid visual stutter (UI jank), what is the generally accepted maximum latency budget for the model to process a single frame?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 0}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100ms (Cloud P99 Latency)", "33ms (30 FPS Video Deadline)", "16ms (60 FPS UI Jank Budget)", "1ms (TinyML Interrupt Budget)"], "correct_index": 2}}, {"id": "mobile-0327", "title": "The On-Device Autocomplete Queue", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can autocomplete keep up when each 6-token suggestion takes 450ms but requests arrive every 400ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it can keep up. The decoding time is only 250ms, which is less than the 400ms arrival time.", "Yes, it can keep up. The first token appears in 200ms, which is fast enough to feel responsive.", "No, it cannot keep up. The total service time is 450ms per request, but new requests arrive every 400ms.", "It depends on the priority of the UI thread."], "correct_index": 2}}, {"id": "mobile-0328", "title": "The Driver Drowsiness Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy (in Watt-hours) this feature consumes over an 8-hour driving shift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["40.0 Wh", "4.0 Wh", "4.36 Wh", "0.55 Wh"], "correct_index": 2}}, {"id": "mobile-0329", "title": "The Energy Cost of Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the fundamental physical reason why sending small, trained model updates from the device is often more energy-efficient than uploading the raw user data to a central server?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cellular data plans are expensive, so it primarily saves the user money.", "Central servers have more powerful GPUs, leading to faster training overall.", "Wireless data transmission is significantly more energy-intensive per byte than on-chip computation.", "On-device 
computation is too slow, so uploading is the only realistic option for training."], "correct_index": 2}}, {"id": "mobile-0330", "title": "The Mobile TOPS Illusion", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does a 10 GOps AR filter reading 200 MB fail to achieve a mobile NPU's advertised 35 TOPS peak?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 1}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound, so its latency is ~0.28 ms.", "The model is memory-bound because its AI is less than the hardware's Ridge Point.", "The model is compute-bound because its AI is high.", "The model is memory-bound, so its latency is determined by compute."], "correct_index": 1}}, {"id": "mobile-0331", "title": "Depthwise Separable Convolutions: FLOPs Reduction", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate reduction in computational cost (FLOPs) you would expect?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2x", "~9x", "~28x", "~256x"], "correct_index": 1}}, {"id": "mobile-0334", "title": "The Background Battery Killer", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming a standard 4000 mAh (3.8V) battery, what percentage of the battery will this feature consume in 24 hours?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3%", "~11% (Ignores 24-hour accumulated idle drain)", "~14%", "~40%"], "correct_index": 2}}, {"id": "mobile-0335", "title": "The Battery Tax of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the most critical resource consumed during on-device training that risks user churn?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 0}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Network bandwidth for model updates.", "On-device flash storage for the model.", "User battery life.", "A 1.1% RAM reduction causing memory swapping."], "correct_index": 2}}, {"id": "mobile-0336", "title": "Federated Learning vs Centralized Video Upload Cost for a 100k-Vehicle Fleet", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the primary economic argument for using Federated Learning, and what is the data transfer cost for the centralized approach?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized upload is cheaper because 250 MB per vehicle is too small to affect a fleet-scale cellular bill.", "FL is cheaper primarily because it eliminates all on-device compute and battery costs.", "FL is economically attractive because it avoids a $250,000 centralized raw-video transfer cost for each dataset refresh.", "Centralized training is cheaper because Secure Aggregation makes FL model updates larger than raw video clips."], "correct_index": 2}}, {"id": "mobile-0337", "title": "The Style Transfer Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 50 GOps layer moving 200 MB memory-bound or compute-bound on an A17 Pro (35 TOPS, 51.2 GB/s), and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the layer's Arithmetic Intensity (250 Ops/Byte) is high.", "Memory-bound, because its Arithmetic Intensity (4 Ops/Byte) is far below the hardware ridge point.", "Memory-bound, because its Arithmetic Intensity (250 Ops/Byte) is below the A17 Pro's ridge point (~683 Ops/Byte).", "Compute-bound, because 50 Giga-operations will saturate the A17's 35 TOPS compute capacity."], "correct_index": 2}}, {"id": "mobile-0338", "title": "The Mobile Memory Wall: Attention Scaling & Variants", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much memory is needed to load the FP16 weights of a 7B-parameter model for mobile inference?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["112 GB", "28 GB", "14 GB", "7 GB"], "correct_index": 2}}, {"id": "mobile-0339", "title": "The UI Jank Budget: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure a smooth user experience without causing UI 'jank', what is the maximum latency budget your model's inference can consume per frame on a typical 60Hz smartphone display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 2}}, {"id": "mobile-0341", "title": "The Dashcam Battery Drain Dilemma: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much energy does the model consume during a 30-minute commute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 Wh", "30 Wh", "0.5 Wh", "120 Wh"], "correct_index": 2}}, {"id": "mobile-0342", "title": "The RAG Update Bill", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", 
"phase": "inference", "question": "How large is a monthly OTA embedding update for 500,000 locations with 768-dim FP16 vectors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~96 MB", "~384 MB", "~768 MB", "~1.54 GB"], "correct_index": 2}}, {"id": "mobile-0343", "title": "The Mobile Energy Culprit", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which part of a background mobile neural network usually dominates energy use at the hardware level?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Executing the FP16/INT8 multiply-accumulate (MAC) operations in the NPU.", "Reading sensor data from the phone's accelerometer.", "Moving model weights and activations from system DRAM to the NPU's local memory.", "Writing inference results to the app's log file on flash storage."], "correct_index": 2}}, {"id": "mobile-0345", "title": "The Mobile Roofline Trap", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 10 GOps, 50 MB camera-filter layer memory-bound on an A17 Pro, and what roofline comparison shows it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the layer's Arithmetic Intensity is 200 Ops/Byte, which easily saturates the 35 TOPS ANE.", "Memory-bound, because the layer's Arithmetic Intensity (200 Ops/Byte) is less than the A17's Ridge Point (~683 Ops/Byte).", "Compute-bound, because 35 TOPS is a massive amount of performance, meaning compute limits execution.", "Memory-bound, because the Arithmetic Intensity is ~488 Ops/Byte."], "correct_index": 1}}, {"id": "mobile-0347", "title": "The Predictive Text Deadline", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Does the model meet the 500ms latency deadline for generating exactly 5 tokens?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["400ms. It meets the deadline with 100ms to spare.", "250ms. It meets the deadline with 250ms to spare.", "350ms. It meets the deadline with 150ms to spare.", "750ms. 
It fails to meet the deadline."], "correct_index": 2}}, {"id": "mobile-0348", "title": "The 'Magic Compose' Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total energy does the feature consume over a 16-hour active day?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["48 Wh", "0.3 Wh", "4.8 Wh", "3.0 Wh"], "correct_index": 2}}, {"id": "mobile-0349", "title": "The Battery Price of Privacy", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total power draw of a modern smartphone SoC when it's actively running an ML workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 - 50 milliwatts", "30 - 60 Watts", "3 - 5 Watts", "700 Watts"], "correct_index": 2}}, {"id": "mobile-0353", "title": "Compute-Bound Keyboard Model Size Estimate", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Under a compute-only 35 TOPS peak estimate, what INT8 model size fits a 100ms one-token budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.75 Billion parameters", "3500 Billion parameters", "1.75 trillion parameters (1750 billion)", "17.5 Trillion parameters"], "correct_index": 2}}, {"id": "mobile-0354", "title": "The Airport Parking Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much total energy, in Watt-hours, will this feature consume from the car's battery over a continuous 24-hour period?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["96 Wh", "1152 Wh", "0.32 Wh", "19.2 Wh"], "correct_index": 2}}, {"id": "mobile-0356", "title": "The Core Benefit of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary benefit of using Federated Learning in this mobile context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It sends 1 TB of user data to a central server to be trained more efficiently on a large GPU cluster.", "It allows model training on local user data without the raw data (0 MB) ever leaving the device.", "It makes model inference faster on the mobile device by compressing the model before deployment.", "It encrypts 1 TB of user data before sending it to the server, where it is decrypted for training."], "correct_index": 1}}, {"id": "mobile-0357", "title": "The Federated Economics Trade-Off", "topic": "federated-learning", "competency_area": 
"cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What monthly data volume does centralized collection generate for 100,000 users uploading 50 MB per day for 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["5 TB", "150 GB", "150 TB", "1.5 GB"], "correct_index": 2}}, {"id": "mobile-0358", "title": "The Mobile Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the A17 Pro ridge point, and what does it say about when a workload becomes compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.46 Bytes/Op; your model must use less than this per operation to be efficient.", "~7 TOPS/W; this is the power efficiency you can expect from the chip.", "~684 Ops/Byte; a model's Arithmetic Intensity must exceed this to be compute-bound.", "~0.68 Ops/Byte; you only need one operation per byte to be compute-bound."], "correct_index": 2}}, {"id": "mobile-0360", "title": "The Jank Budget: Batching vs. Latency", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user's request is the first to enter an empty batch, what is their effective Time to First Token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["12ms", "15ms", "25ms", "27ms"], "correct_index": 3}}, {"id": "mobile-0362", "title": "The OTA Double-Storage Tax", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total storage footprint required during this transition and what is the final number?", "chain_ids": ["mobile-chain-auto-001-06"], "chain_positions": {"mobile-chain-auto-001-06": 1}, "chain_tiers": {"mobile-chain-auto-001-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 MB", "150 MB", "200 MB", "250 MB"], "correct_index": 3}}, {"id": "mobile-0363", "title": "The Federated Learning Cost Advantage", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a Total Cost of Ownership (TCO) perspective, what is the primary economic advantage of using Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Faster model training due to massively parallel on-device computation.", "Reduced cloud storage and network egress costs from not uploading raw user data.", "Lower initial engineering cost because Federated Learning frameworks are open source.", "Elimination of all server-side computation costs."], "correct_index": 1}}, {"id": 
"mobile-0364", "title": "The MobileNet Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of this depth-wise convolution, and why is it memory-bound?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["AI is ~25,175 Ops/Byte; it's compute-bound.", "AI is ~9 Ops/Byte; it's compute-bound because it involves millions of operations.", "AI is ~4.5 Ops/Byte; it's memory-bound.", "AI is ~9 Ops/Byte; it's memory-bound because this is far below the A17's ridge point of ~683 Ops/Byte."], "correct_index": 3}}, {"id": "mobile-0366", "title": "The Real-Time Code Completion Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 50ms TPOT phone NPU meet 15 tokens/sec for one code-completion user, and what is the maximum number of parallel sessions it can support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it fails the spec because the NPU's 50ms latency is a larger number than the 15 tokens/sec requirement.", "Yes, it meets the spec. It can support any number of sessions as long as their total requested tokens are less than 20 per second.", "Yes, it meets the single-user spec. However, only 1 session can be supported because adding a second increases the effective latency to 100ms, violating the 66.7ms deadline.", "Yes, it meets the spec. 
It can support up to 3 concurrent sessions."], "correct_index": 2}}, {"id": "mobile-0367", "title": "The Dashcam Battery Drain: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user drives for one hour, how much total energy does your feature consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.0 Wh", "0.6 Wh", "1.0 Wh", "1.0 W"], "correct_index": 2}}, {"id": "mobile-0368", "title": "The OTA Battery Tax", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What fraction of a phone battery is used to download a 200 MB OTA model over cellular at 3W and 10 MB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.7%", "~3.15%", "~0.09%", "~0.03%"], "correct_index": 2}}, {"id": "mobile-0370", "title": "The On-Device Training Battery Tax", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total energy consumed over a 4-week test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3 Wh", "12 W", "12 Wh", "20 Wh"], "correct_index": 2}}, {"id": "mobile-0371", "title": "The Mobile Roofline Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the roofline ridge point represent, and what is it in operations per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. (Forgot 1000x difference)", "7 TOPS/W. (Power efficiency, not ridge point)", "~684 Ops/Byte. Workloads must exceed this Arithmetic Intensity to be compute-bound.", "~68 Ops/Byte. 
(Misplaced decimal)"], "correct_index": 2}}, {"id": "mobile-0372", "title": "The Real-Time Keyboard Assistant", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the critical user experience advantage of using continuous batching in this scenario?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It maximizes overall tokens per second (TPOT) across all users.", "It reduces the model's on-chip memory footprint.", "It reduces the Time To First Token (TTFT) to avoid UI lag.", "It lowers total power consumption by requiring fewer CPU cycles for scheduling."], "correct_index": 2}}, {"id": "mobile-0373", "title": "The Unstable Queue: Mobile Latency Collapse", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the end-to-end latency for the fifth audio chunk when chunks arrive every 100ms but processing takes 120ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["120 ms", "140 ms", "200 ms", "600 ms"], "correct_index": 2}}, {"id": "mobile-0374", "title": "The Background Service Battery Killer", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a user receives one message every 50 seconds on average, what is the *average* power consumption of your feature over time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["20.0 mW", "40.0 mW", "39.8 mW", "1010.0 mW"], "correct_index": 2}}, {"id": "mobile-0375", "title": "The Mobile OTA Update Budget", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak storage is needed for a fail-safe 10 MB patch update of a 100 MB mobile model?", "chain_ids": ["mobile-chain-auto-001-10"], "chain_positions": {"mobile-chain-auto-001-10": 1}, "chain_tiers": {"mobile-chain-auto-001-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 MB (Assumes a risky in-place patch with no overhead)", "110 MB (Forgets a new file is created, just adds patch to original size)", "200 MB (Forgets to include the storage for the patch file itself)", "210 MB (Correctly accounts for original, patch, and new files)"], "correct_index": 3}}, {"id": "mobile-0376", "title": "The Federated Learning Bandwidth Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do centralized and Federated Learning daily bandwidth costs compare for 10M keyboard users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: $500/day, Federated: 
$5,000/day. Federated is 10x cheaper.", "Centralized: $50/day, Federated: $500/day. Federated is 10x more expensive.", "Centralized: $5/day, Federated: $500/day. Federated is 100x more expensive.", "Centralized: $5/day, Federated: $5/day. The costs are identical."], "correct_index": 2}}, {"id": "mobile-0377", "title": "The Mobile Roofline Test", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is a 50 GOPS, 100 MB convolution layer compute-bound or memory-bound based on roofline math?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because its arithmetic intensity (500 Ops/Byte) is less than the device's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity (500 Ops/Byte) is less than the device's ridge point (~683 Ops/Byte).", "Compute-bound, because its arithmetic intensity is 5,000 Ops/Byte, which is greater than the device's ridge point.", "Compute-bound, because 35 TOPS is a very high peak value, so the workload is likely limited by compute."], "correct_index": 0}}, {"id": "mobile-0383", "title": "The Federated Learning Energy Bill", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What total daily energy does 20 seconds of 4W federated training consume across 1M phones in kWh?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.05 kWh", "~22,222 kWh", "~22.2 kWh", "~167 kWh"], "correct_index": 2}}, {"id": "mobile-0384", "title": "The Mobile NPU Ridge Point", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the A17 Pro roofline ridge point for 35 TOPS compute and 51.2 GB/s memory bandwidth?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.68 Ops/Byte. This is the arithmetic intensity.", "~0.0014 Bytes/Op. This is the memory required per operation.", "~684 Ops/Byte. This is the minimum arithmetic intensity required to be compute-bound.", "~5468 Ops/Byte. 
This is the ridge point, calculated by converting bytes to bits."], "correct_index": 2}}, {"id": "mobile-0385", "title": "The Mobile 'Jank' Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To ensure the UI remains perfectly smooth and never 'stutters', what is the approximate latency budget your model inference must meet for a 60Hz display?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 2}}, {"id": "mobile-0386", "title": "The 'Instant' Assistant's Waiting Game", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What TTFT does a FIFO queue impose when a 2B INT8 assistant has 50 memory-bound tokens left at 51.2 GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 39ms, because 2 GB / 51.2 GB/s = 39ms, assuming no queuing delay.", "About 19ms, because the 35 TOPS NPU processes the 50 tokens instantly, leaving only memory overhead.", "About 39ms, assuming the new request can be immediately batched via static batching.", "Nearly 2 seconds, because the new request must wait for 50 tokens at 39ms/token (1,953 ms) to finish."], "correct_index": 3}}, {"id": "mobile-0389", "title": "The Federated Learning Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a device energy consumption perspective, which single operation do you expect to be more expensive for the end-user's battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Sending the 1KB sample, because cellular data transmission is very power-hungry.", "The local training round, because of the high number of floating-point operations (FLOPs).", "The local training round, due to the high energy cost of repeated DRAM access for model parameters.", "They consume roughly the same amount of energy."], "correct_index": 2}}, {"id": "mobile-0391", "title": "The Cellular Energy Tax", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which uses more phone battery: uploading 1 MB over 4G for cloud inference or running a 100 MFLOP NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device inference, by about 100x", "They are roughly equal in energy cost", "Sending the data, by over 10,000x", "Sending the data, but only by about 10x"], "correct_index": 2}}, {"id": "mobile-0394", "title": "The TCO Blindspot: Training vs. 
Inference", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Over a year, why can cumulative on-device inference energy dominate one-time cloud training energy?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The one-time model training cost, as datacenter GPUs have a very high TDP.", "The cumulative inference cost on user devices, due to the massive scale of the user base.", "They are roughly equivalent; the high power of training is balanced by the scale of inference.", "The energy cost of A/B testing different models before deployment."], "correct_index": 1}}, {"id": "mobile-0395", "title": "Why MobileNetV3 SE Blocks Improve Mobile Accuracy-Latency Trade-offs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why are MobileNetV3 squeeze-and-excitation blocks worth 2% extra FLOPs on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0397", "title": "The Trivial Model Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a CPU beat the NPU for a single 100-neuron dense layer on a modern mobile SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0399", "title": "The NCHW vs NHWC Memory Layout", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an NCHW PyTorch model run 3x slower than an NHWC-style TFLite model on an Android CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0401", "title": "The Real-Time Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the performance bottleneck by calculating the total frame latency on the older device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is too slow. 25ms for 95 layers is the main bottleneck; the model needs to be pruned to run faster on the NPU.", "The 2ms context switch overhead is the primary issue, as it adds nearly 10% to the latency budget. The model must be re-architected to remove all unsupported ops.", "The total latency is ~42ms (25ms NPU + 15ms CPU + 2ms overhead), exceeding the 33ms budget. 
The 15ms spent on the CPU is the main bottleneck.", "A 95% delegation ratio is simply not high enough for real-time video; you must achieve at least 99% for the feature to be viable."], "correct_index": 2}}, {"id": "mobile-0404", "title": "The Fallback Tax", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an iPhone 15 Pro replace unsupported FancyReLU layers to recover 12ms ANE latency without 0.5% accuracy loss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Implement a custom Metal GPU kernel for `FancyReLU` to accelerate its execution compared to the CPU.", "Apply INT8 quantization to the whole model to reduce the data size and speed up the CPU execution of `FancyReLU`.", "Replace `FancyReLU` with a similar, ANE-supported activation like `SiLU` (Swish) and retrain the model to recover the accuracy.", "Distribute the `FancyReLU` fallbacks across 10 threads to execute them concurrently, bringing total execution time under 1ms."], "correct_index": 2}}, {"id": "mobile-0408", "title": "Federated Learning Uploads Drain Battery Over Cellular", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the user complaints regarding data and battery drain?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 2}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 2GB memory footprint causes heavy use of slow flash storage and increasing power draw.", "The 50M parameter model update is approximately 200MB, and transmitting this large payload over a cellular network consumes excessive energy and data.", "The federated learning server is experiencing high latency, maintaining an open connection.", "50M parameters at FP16 is 50MB, causing minor drain. 
(Calculated trap for wrong precision)."], "correct_index": 1}}, {"id": "mobile-0410", "title": "The Catastrophic Accuracy Drop", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ drop an A17 Pro retail product model from 95% FP32 accuracy to 20% after office-only calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex; INT8 format lacks the precision to represent the weights accurately.", "The Apple Neural Engine does not correctly support a key operator, causing a silent fallback to the CPU with incorrect results.", "The calibration dataset was not representative of the production data, causing activation values to overflow the INT8 range during inference.", "The conversion process introduced too much noise, and the model needs to be re-trained with quantization-aware training (QAT)."], "correct_index": 2}}, {"id": "mobile-0411", "title": "The Janky AR Filter", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which mixed-precision plan gets an AR style transfer filter under 33 ms while protecting one unstable depthwise layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantize the entire model to INT8 to achieve the maximum possible speedup.", "Keep the sensitive layer in FP16 but quantize the other 49 layers to INT8.", "The NPU is the bottleneck. Offload the model to run on the mobile GPU instead.", "The model is too large. 
Prune 25% of the channels from all layers and retrain the model."], "correct_index": 1}}, {"id": "mobile-0412", "title": "The Deceiving FLOPs Reduction", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 9x FLOPs reduction from depthwise separable convolutions yield only a small mobile latency improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0413", "title": "The Transformer Tax on Mobile", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the architectural reason for the ViT's poor latency on mobile hardware, despite its comparable size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The LayerNorm operations in the ViT are not supported by the ANE, causing slow CPU fallbacks.", "The ViT has a much larger total activation memory footprint, exceeding the ANE's on-chip SRAM.", "Self-attention's non-local memory access patterns have low arithmetic intensity, making it bottlenecked by DRAM bandwidth.", "The patch embedding (stem) layer of the ViT uses a large convolution that is inefficient on mobile GPUs."], "correct_index": 2}}, {"id": "mobile-0414", "title": "The Stuttering Generative Keyboard", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What most likely causes keyboard stutter when TTFT is 80ms and TPOT is 39ms against a 16.67ms UI deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large; it should be quantized from FP16 to INT8 to reduce TPOT.", "The inference workload is not being fully delegated to the Apple A17's Neural Engine, falling back to the CPU.", "The inference pipeline is blocking the UI thread; a non-blocking, continuous batching architecture is needed.", "The NPU is overheating and thermally throttling, causing intermittent slowdowns."], "correct_index": 2}}, {"id": "mobile-0415", "title": "The Lagging AR Bounding Box", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is batching four 60 FPS AR frames worse than optimizing single-frame inference below 16.67ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's arithmetic intensity is too low, making it memory-bound and thus unsuitable for batching.", "The system must process frames in real-time. The single-frame inference time (22ms) already violates the 16.67ms deadline, and batching will only increase total latency.", "Batching is correct. 
It will improve NPU utilization and system throughput, eventually clearing the backlog of frames.", "A dynamic batching system that adapts the batch size based on queue length should be implemented."], "correct_index": 1}}, {"id": "mobile-0416", "title": "The Mobile GPU's Memory-Go-Round", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an unfused Conv2D-BatchNorm-ReLU sequence with 1 MB intermediates slow, and what optimization fixes it?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0417", "title": "The Real-time Rendering Wall", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which model optimization is more likely to meet a 16ms mobile budget: unstructured pruning or dense distillation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Distillation only improves accuracy, it doesn't solve latency issues.", "Pruning halves the FLOPs, so latency will become 100ms, which is a significant improvement.", "A small, dense student model from distillation is NPU-friendly and will likely meet the 16ms budget, whereas the pruned model will see little speedup.", "Neither will work; the only solution is to run the model on the cloud via an API call."], "correct_index": 2}}, {"id": "mobile-0420", "title": "The Case of the Disappearing Pixels", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can ISP resizing from 12MP to 640x480 cause a major small-object accuracy drop despite fast NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0421", "title": "The Privacy Paradox TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What hidden FL TCO driver can dominate when daily on-device keyboard training drains about 2.2% battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 1MB daily cellular data upload for the model update is too expensive for users on limited data plans.", "The daily ~2.2% battery drain from the 5-minute on-device training is perceptible to users, causing them to disable the feature.", "The FL model updates are introducing inference latency ('jank') into the keyboard UI, leading to a poor user experience.", "The on-device training process is filling up the phone's storage with temporary files, causing 'storage full' warnings."], "correct_index": 1}}, {"id": "mobile-0422", "title": "The Mobile Style Transfer Stall", "topic": "roofline-analysis", "competency_area": 
"compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an AdaIN layer with 1 Ops/Byte far below the A17 Pro ridge point, causing it to be severely bottlenecked by memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The device is thermal throttling, forcing the ANE to run at a lower frequency due to 16.8 M-Ops density.", "The layer is compute-bound; 16.8M operations takes 2ms, implying the ANE is operating at 8.4 GOPS peak.", "The layer is memory-bound; its Arithmetic Intensity (1 Op/Byte) is far below the hardware's Ridge Point (~683 Ops/Byte).", "The ANE is running at low TOPS to improve its TOPS/W power efficiency for this simple layer, resulting in 2ms execution time."], "correct_index": 2}}, {"id": "mobile-0423", "title": "The Corrupted Video Frame", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on mobile hardware pipelines, what is the most likely cause of this post-quantization visual artifacting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The INT8 format has lower precision, causing cumulative rounding errors to build up and corrupt the final output.", "The Apple A17 Neural Engine has a hardware bug in its INT8 matrix multiplication units that is triggered by this model's architecture.", "The calibration dataset was not representative, leading to activation values at inference time overflowing the INT8 dynamic range and getting clipped by the hardware.", "The model's FP16 weights were not properly converted, and their range is too large to fit into INT8, causing them to be clipped before inference begins."], "correct_index": 2}}, {"id": "mobile-0425", "title": "The NPU Architecture Dilemma: CNN vs. 
ViT", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why might MobileNetV3 beat a smaller-parameter ViT on a mobile NPU despite the ViT having fewer weights?", "chain_ids": ["mobile-chain-auto-secondary-010-09"], "chain_positions": {"mobile-chain-auto-secondary-010-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MobileNetV3, because the ViT's self-attention operations are severely memory-bandwidth bound on mobile NPUs.", "The ViT, because its self-attention mechanism consists of large matrix multiplies that can saturate the NPU's compute units.", "The ViT, because it has fewer parameters, which means a smaller memory footprint and faster memory access.", "They will have identical performance, as the NPU is designed to abstract away architectural differences."], "correct_index": 0}}, {"id": "mobile-0426", "title": "The Jank Frame Catastrophe", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the video filter spike from 20 ms to 55 ms when a system notification animates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex and must be pruned or quantized further to reduce its baseline latency.", "The DSP is not powerful enough to guarantee real-time performance.", "The OS scheduler is contending for shared GPU resources between your app's inference and the system's UI rendering.", "Thermal throttling from the 20ms inference forces the SoC to downclock instantly."], "correct_index": 2}}, {"id": "mobile-0428", "title": "The Jank-Inducing Mobile Generative Model", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What does the 'work-gap-work-gap' profiler pattern indicate, and how can operator fusion fix it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bound: apply unstructured pruning to reduce footprint to 15ms. (Trap: 15ms compute without addressing overhead).", "The model is compute-bound: distill the architecture to save 10ms of compute. (Trap: ignores the 10ms idle gaps).", "The model is dispatch-bound due to kernel launch overhead: use operator fusion to combine sequential operations into fewer, larger kernels.", "The SoC is thermally throttling: add a 10ms cool-down period. 
(Trap: misinterprets idle gap as thermal throttling)."], "correct_index": 2}}, {"id": "mobile-0430", "title": "The Cross-GPU Choke", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause for the ~600ms of unaccounted-for latency on the server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the server to the storage cluster is saturated.", "Reading the 32 GB tensor from the first GPU's HBM3 memory is the bottleneck.", "The GPUs are communicating over the PCIe bus instead of a direct NVLink bridge, which is saturated by the 32 GB transfer.", "The server's CPU is overloaded with OS context switching, causing the delay."], "correct_index": 2}}, {"id": "mobile-0431", "title": "The Privacy-Aware TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does breach risk change the TCO case for Federated Learning versus centralized keystroke logging?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cloud data ingress costs of $60 are far lower than the $250,000 FL engineering cost, so the cloud A/B test is cheaper.", "The on-device compute for FL will drain user batteries, leading to more churn than any potential data leak.", "The risk-adjusted centralized experiment cost is $100,000, making the one-time $250,000 FL investment economically rational after 2.5 experiments.", "A 5% churn from a data leak is unrealistic; the actual financial risk is likely zero, so the focus should be on the lowest direct cost."], "correct_index": 2}}, {"id": "mobile-0432", "title": "The Mobile Video Filter Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is an 800 GOps, 80 MB video filter compute-bound or memory-bound, and what optimization follows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's memory-bound. The 51.2 GB/s LPDDR5 bandwidth is insufficient.", "It's power-bound. The model's TOPS/W efficiency is too low.", "It's compute-bound. Its arithmetic intensity of 10,000 Ops/Byte is significantly higher than the Ridge Point.", "It's latency-bound. 
The time to read from UFS 4.0 flash storage is the primary bottleneck."], "correct_index": 2}}, {"id": "mobile-0433", "title": "The Night Vision Quantization Failure", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does daytime-only INT8 calibration make a sign detector miss red stop signs at night?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Nighttime pixels cause integer overflow because the 255 max bin is exceeded by aggressive ISP sensor gain.", "The Snapdragon Hexagon DSP requires asymmetric quantization for low-light scenarios, which was omitted during export.", "The INT8 scale factor of 0.0627 mapped from daytime data forces the 1.0 nighttime dynamic range into only ~16 levels, causing severe quantization noise.", "The 10,000 calibration images exhausted the limited TFLite calibration buffer, forcing a stealth fallback to INT16."], "correct_index": 2}}, {"id": "mobile-0435", "title": "The Smart Reply Latency Puzzle: NAS vs. MoE", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose why the theoretically-efficient MoE model is slower in practice on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0436", "title": "The Real-Time Driver Alert Failure", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a driver alert pipeline meet average latency but fail its 100ms safety deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75ms is the average, so it meets the deadline. 
The failure must be a memory leak.", "The camera's 33.3ms (30 FPS) arrival rate exceeds the 40ms average inference time, causing a queue overflow.", "The cumulative effect of worst-case scheduler jitter across all pipeline stages pushes the total latency beyond the 100ms budget (110ms).", "The 50ms worst-case inference leaves only 50ms for the remaining 3 stages, which average 35ms, so it should succeed."], "correct_index": 2}}, {"id": "mobile-0439", "title": "The Slow Style Transfer", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What server interconnect issue explains multi-GPU style transfer data transfers taking about 8 seconds instead of milliseconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The GPUs are in different server racks, and the latency comes from the InfiniBand network connection between them.", "A CUDA driver misconfiguration is forcing data to be copied to CPU host memory before being transferred to the other GPUs.", "The server lacks an NVLink bridge, forcing the GPUs to communicate over the much slower PCIe bus.", "The 250 GB data transfer is simply too large for any interconnect, and this ~4 second latency is expected even with NVLink."], "correct_index": 2}}, {"id": "mobile-0440", "title": "The Live Filter Battery Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an A17 Pro live portrait layer with 151M INT8 ops and 16.8 MB traffic battery-heavy and thermally unstable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; its AI of ~9,000 Ops/Byte exceeds the device's ridge point, meaning the NPU is the bottleneck.", "The layer is memory-bound; its AI of ~9 Ops/Byte is far below the device's ridge point of ~683 Ops/Byte, meaning the NPU is starving for data.", "The bottleneck is the INT8 precision; the A17 Pro's NPU is more efficient with FP16, and switching would resolve the thermal issue.", "The issue is weight-related cache misses; the 128-channel kernel is too large and is thrashing the L1/L2 cache during execution."], "correct_index": 1}}, {"id": "mobile-0441", "title": "The Headlight Saturation Problem", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this night-time failure, and how would you solve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile SoC's NPU lacks the computational power for this model. The model needs to be pruned or a more powerful SoC is required.", "The calibration dataset was not representative, leading to a narrow quantization range and activation overflow on high-dynamic-range night images. Re-calibrate with better data or use mixed-precision for the input layers.", "INT8 precision is fundamentally insufficient for safety-critical perception tasks. 
The entire model must be reverted to FP16, sacrificing the performance gains.", "This is a sign of a bug in the TFLite/CoreML converter's rounding implementation during quantization. You should try a different version of the conversion tool or report the issue."], "correct_index": 1}}, {"id": "mobile-0442", "title": "The Sluggish Car AI", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause for this massive discrepancy and the resulting UI jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU takes 130ms to compute the initial 100 tokens, blocking the 16ms render thread.", "The 77 GB/s memory bus restricts the prefill to 39ms, which still violates the 16ms render deadline 2 times.", "The prefill step is memory-bandwidth bound; the 195ms blocking call to read weights from DRAM stalls the UI thread.", "The Android OS scheduler is de-prioritizing the inference thread, causing context-switching delays."], "correct_index": 2}}, {"id": "mobile-0443", "title": "The Unstable Gallery Indexer", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 30 FPS video indexing queue grow infinitely when a 30ms/frame model shares the NPU with a 100ms/sec background task?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system has 10% headroom since 30ms < 33.3ms; the queue growth is likely a memory leak.", "The system processing time is 130ms, which is larger than the 33.3ms arrival rate.", "The 100ms background task increases the frame time to 130ms, missing the 33.3ms deadline.", "System utilization is 1.0; the effective service rate matches the arrival rate, causing queue instability."], "correct_index": 3}}, {"id": "mobile-0444", "title": "The AR Frame Drop", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 4K camera frame stall the GPU despite 10ms NPU inference in an AR overlay pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is taking longer than profiled, and the GPU is waiting for the detection results before it can start rendering.", "The unified LPDDR5 memory bus is saturated due to contention between the CPU, GPU, and NPU, slowing down the GPU's data fetch.", "The GPU is stalled waiting for the full 4K camera frame to transfer over the slow MIPI bus into DRAM; the peripheral bus is the bottleneck.", "A cache coherency delay between the NPU's write to memory and the GPU's read from memory is forcing a slow data synchronization."], "correct_index": 2}}, {"id": "mobile-0446", "title": "The Night-Vision Accuracy Collapse", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ keep 98.5% daytime accuracy 
but collapse below 50% on night clips with headlight activation spikes?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 0}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile NPU has buggy INT8 support. Keep the problematic layers in FP16 (mixed precision) and quantize the rest.", "The model architecture is unstable. Add more Batch Normalization layers to better regulate activation distributions before re-quantizing.", "The calibration dataset is not representative of night-time conditions, causing activation overflow. Re-run quantization with a new calibration set that includes night-driving clips.", "Per-tensor quantization is too coarse. You must switch to per-channel quantization to provide a more fine-grained scaling factor for each filter."], "correct_index": 2}}, {"id": "mobile-0447", "title": "The Style Transfer Battery Drain", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a 70 GOps, 200 MB style transfer model memory-bound on an A17 Pro, and what should you optimize first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. The 70 G-OPS requires pruning because 70 / 35 TOPS = 2ms compute.", "The model is memory-bound because UFS flash storage is too slow to load weights.", "The model is memory-bound. Its AI of 350 Ops/Byte is below the ridge point (~683 Ops/Byte). 
Prioritize operator fusion.", "The model is compute-bound because the 35 TOPS NPU cannot sustain 30 FPS at 70 G-OPS."], "correct_index": 2}}, {"id": "mobile-0448", "title": "The Quantization Cliff", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can daytime-only INT8 calibration cause a driver drowsiness model to fail at night?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model architecture is too sensitive for INT8 and must be redesigned with explicit activation clipping functions like ReLU6.", "The calibration dataset was not representative of production data, causing activation saturation (overflow) during night-time inference.", "The mobile NPU has a hardware bug in its INT8 multiply-accumulate unit that causes it to produce erroneous high values.", "The per-tensor quantization scheme is flawed; switching to per-channel quantization is the only way to fix this."], "correct_index": 1}}, {"id": "mobile-0450", "title": "The Battery-Draining Vision Transformer", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the fundamental architectural reason for this significant performance regression on a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MLP blocks in the ViT are larger than the equivalent CNN layers, requiring more peak compute.", "Self-attention has poor data locality, causing a massive increase in energy-expensive random DRAM accesses.", "The Apple Neural Engine is not optimized for the LayerNorm and Softmax operations used in Transformers.", "The ViT's activation memory exceeds the L2 cache size, forcing it to use the slower main memory."], "correct_index": 1}}, {"id": "mobile-0451", "title": "The Driver Monitoring Deadline Miss", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 30 ms DMS pipeline miss a 50 ms deadline when a 50 ms gesture model runs once per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Landmark Prediction model at 15ms is too slow and must be optimized.", "The low-priority gesture model is blocking the NPU, causing DMS frames to be queued and miss their deadline.", "The hardware TOPS is insufficient for this combined workload.", "The camera's 30 FPS rate is too high, overwhelming the system."], "correct_index": 1}}, {"id": "mobile-0453", "title": "The Mystery of the Slow Avatar", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What backend communication misconfiguration can explain a 1.2s latency gap in a 4-GPU tensor-parallel avatar service?", "chain_ids": ["mobile-chain-auto-secondary-017-08"], "chain_positions": {"mobile-chain-auto-secondary-017-08": 0}, "chain_tiers": 
{"mobile-chain-auto-secondary-017-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The communication library (e.g., NCCL) is misconfigured, falling back to the kernel's TCP/IP stack instead of using a direct interconnect.", "The latency is dominated by 40GB of activations transferring over 10 Gbps Ethernet due to cross-node scheduling.", "The 4 GPUs are actually in different servers, and the 1.2s delay is the time spent transferring data over InfiniBand.", "The model's compute time is highly variable, and the 200ms figure is an average; P99 compute is likely over 1.4s."], "correct_index": 0}}, {"id": "mobile-0456", "title": "The Saturated Night-Vision Model", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can an INT8 dashcam sign detector work overall but fail completely on nighttime videos?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU drivers have a bug and don't support the specific convolution types used for processing low-light features.", "The model is suffering from catastrophic forgetting during Quantization-Aware Training (QAT).", "The calibration dataset lacked representative nighttime images, leading to incorrect quantization parameters that saturate activations for dark inputs.", "The FP16 model was already numerically unstable, and the reduced precision of INT8 caused activations to collapse to zero."], "correct_index": 2}}, {"id": "mobile-0457", "title": "The AR Jank Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which model change gets a 5M-parameter ViT AR filter under a 16 ms A17 Pro frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0459", "title": "The Mobile 'Training Cluster' Fallacy", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is a 40 Gbps USB4 phone cluster a show-stopper for 1B-parameter data-parallel fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The phone's OS (Android/iOS) lacks the necessary drivers and kernel support for RDMA or collective communication primitives.", "The sustained power draw from the NPU and CPU during training would cause the phones to overheat and thermally throttle, making performance unpredictable.", "The interconnect bandwidth of the USB4 hub is orders of magnitude too low, causing the gradient synchronization step (All-Reduce) to take longer than the computation itself.", "A 1B parameter model with Adam optimizers requires 16GB of memory, which would exceed the available RAM on most phones after accounting for the OS and other apps."], "correct_index": 2}}, {"id": "mobile-0461", "title": "The AR Filter Frame Drop", "topic": "real-time-deadlines", "competency_area": "compute", "track": 
"mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What roofline calculation diagnoses the bottleneck for a 500 GOps AR filter moving 3.4 GB per frame on an A17 Pro?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 2}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. 500 G-Ops at 30 FPS requires 15 TOPS, which exceeds the A17 Pro's 35 TOPS at 50% utilization.", "The device is thermally throttling. 3.4 GB/s exceeds the 5 Watt SoC thermal limit.", "The model is memory-bound. Its Arithmetic Intensity of ~147 Ops/Byte is far below the A17 Pro's ridge point (~683 Ops/Byte).", "The OS scheduler is preempting the inference thread because 500 GOps takes 66ms on 35 TOPS."], "correct_index": 2}}, {"id": "mobile-0462", "title": "The Headlight Blind Spot", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ preserve daytime pedestrian accuracy but fail in high-contrast night scenes with headlights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have overflowed the INT8 range because it was not trained with sufficient weight decay.", "The mobile NPU lacks an optimized kernel for the convolutions used in the early layers, causing a fallback to the much slower CPU and breaking the real-time pipeline.", "High-contrast headlight outliers inflate the PTQ activation scale, causing a catastrophic loss of precision for the majority of normal feature values.", "This is the expected, unavoidable accuracy drop from INT8. 
The model must be retrained from scratch using Quantization-Aware Training (QAT) to recover the lost accuracy."], "correct_index": 2}}, {"id": "mobile-0463", "title": "The Jank-Inducing Transformer", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does replacing a MobileNetV3 segmentation backbone with a ViT cause mobile video-call jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU lacks hardware support for GELU and LayerNorm operations, forcing them onto the much slower CPU and creating a pipeline stall.", "The ViT model needs to be more aggressively quantized from FP16 to INT8, as the NPU's integer units are currently underutilized.", "The ViT's O(N^2) self-attention mechanism creates large, non-local attention matrices that saturate the SoC's memory bandwidth, stalling the NPU.", "The model's parameter count is too large, exceeding the L2 cache capacity and causing constant, slow main memory access for weights."], "correct_index": 2}}, {"id": "mobile-0465", "title": "The AI Avatar's 4-Second Stall", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What likely causes a 4-second inter-GPU stall inside an 8-GPU H100 avatar inference server?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the GPUs is saturated.", "The model is too large, and the all-to-all data transfer is fundamentally limited by PCIe bandwidth.", "A software misconfiguration is forcing GPU communication over the PCIe bus instead of NVLink.", "The CPU is bottlenecking the system, preventing the GPUs from communicating efficiently."], "correct_index": 2}}, {"id": "mobile-0466", "title": "The Federated Learning Thermal Wall", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is sustained on-device federated training blocked by a mobile SoC thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network cost of uploading model updates daily from 1 million cars would be too expensive.", "The daily energy consumption from on-device training would unacceptably drain the car's main battery.", "The SoC will exceed its 5W thermal budget during the 10-minute training session, leading to severe performance throttling.", "The privacy risk of model gradients leaking sensitive facial data is fundamentally unsolved and a legal blocker."], "correct_index": 2}}, {"id": "mobile-0467", "title": "The Mobile Battery Drainer", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 70 GOps model with 350 MB of weights drain battery and leave the NPU underutilized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound because its 70 G-ops overwhelm the compute capacity.", "The model is memory-bound. The AI calculation (350MB / 70 G-ops) shows it is limited by data transfer.", "The model is memory-bound because its Arithmetic Intensity (200 Ops/Byte) is far below the ridge point (~683 Ops/Byte).", "The model is compute-bound because its Arithmetic Intensity of 200 Ops/Byte is a high number, meaning it is computationally complex."], "correct_index": 2}}, {"id": "mobile-0468", "title": "The Disappearing Cyclist", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a dashcam miss cyclists at night when headlights create activation outliers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The original FP16 model was overfit and must be retrained with more night-time data before quantization can be attempted.", "The A17 NPU has a hardware bug in its INT8 convolution kernels that is triggered by high-frequency image features.", "The calibration dataset was not representative, causing activation clipping (overflow) for the night-time cyclist images.", "The model's architecture is inherently unstable for quantization, and INT8 precision is insufficient for this computer vision task."], "correct_index": 2}}, {"id": "mobile-0481", "title": "The Mobile Jank Detective", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which NAS architecture should replace a janky mobile ViT, and why does arithmetic intensity matter?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model B (MoE ViT): It has lower total FLOPs, guaranteeing a direct reduction in latency.", "Model A (CNN): Its high arithmetic intensity overcomes the memory bandwidth bottleneck, maximizing NPU utilization.", "Model B (MoE ViT): Dynamic routing avoids the 51.2 GB/s memory bandwidth limit entirely.", "Model A (CNN): It requires less than 1 TOPS, avoiding the NPU's 35 TOPS thermal throttle."], "correct_index": 1}}, {"id": "mobile-0482", "title": "The On-Device Assistant Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you solve both the high TTFT and the UI jank?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Move the audio transcription to the CPU's efficiency cores and dedicate the NPU entirely to the LLM generation.", "Re-quantize the model to INT4 and apply pruning to reduce the weight size by 50%.", "Implement chunked prompt ingestion and prioritize decode steps over pre-fill steps in the NPU command queue.", "Increase the static batch size to process more user input at once, improving NPU utilization."], 
"correct_index": 2}}, {"id": "mobile-0483", "title": "The Data Center Mindset on Mobile", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can't we build a custom 64 GB/s PCIe-like interface for the phone to eliminate this bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The bottleneck is the mobile link's low bandwidth. We should use a newer, faster standard like USB4 v2 (80 Gbps) to get closer to PCIe speeds.", "The latency comes from serialization/deserialization overhead in the software stack. We should switch to a zero-copy protocol like FlatBuffers.", "The proposal is non-viable due to the prohibitive power consumption of a PCIe-like interface, which would exceed the phone's entire thermal and power budget. The correct approach is on-device model optimization.", "The external accelerator is too slow. The latency is an acceptable trade-off for higher quality, and the battery drain can be solved by asking the user to plug in their phone."], "correct_index": 2}}, {"id": "mobile-0486", "title": "The Night-Vision Overflow", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose and solve this catastrophic performance drop under night-time conditions?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The calibration dataset is not representative of night-driving conditions, causing activation saturation. Re-quantize with a more diverse dataset including night-time samples.", "Post-training quantization is too aggressive for this architecture. The only reliable solution is to implement full Quantization-Aware Training (QAT) to retrain the model from scratch.", "The NPU likely has a hardware bug related to sparse feature maps from IR images. The problematic layers should be forced to run on the more reliable CPU.", "The model's architecture is fundamentally unstable for 8-bit precision. It must be redesigned with more normalization layers."], "correct_index": 0}}, {"id": "mobile-0487", "title": "The Mobile UI Jank Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 20M-parameter ViT cause 25ms mobile UI jank, and what architecture class should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 20M parameters (40MB in FP16) are saturating the A17's 51.2 GB/s memory bandwidth. Quantizing weights to INT8 would halve the memory pressure and fix the jank.", "The A17's Neural Engine likely has poor support for the specific self-attention operations, causing the model to fall back to the much slower CPU.", "The Vision Transformer's self-attention produces large activation tensors that saturate memory bandwidth. The model is memory-bound (~4.6 GFLOPs but 25ms latency), and should be replaced with a CNN architecture that has better data locality.", "The model is too dense. 
Applying Mixture-of-Experts (MoE) routing would reduce the active parameter count per frame and thus the latency."], "correct_index": 2}}, {"id": "mobile-0488", "title": "The On-Device AI Stutter", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What implementation bug causes generated tokens to slow down over a sequence despite an initially fast first word?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is insufficient for a 1B parameter model, causing a compute bottleneck.", "The LPDDR5 memory bus is saturated from loading model weights for each token, causing a memory bandwidth bottleneck.", "The generation loop is performing a stateless re-computation of the entire sequence for each token due to a missing or misused KV cache.", "The mobile OS is thermal throttling the NPU, causing performance to degrade as generation continues."], "correct_index": 2}}, {"id": "mobile-0489", "title": "The Interconnect Blind Spot", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a cloud-trained tensor-parallel 3B model be a poor fit for mobile deployment and quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model relies on tensor-parallel execution over NVLink-class multi-GPU links, which is absent on a single mobile SoC.", "The training cluster's InfiniBand network allowed for faster data loading, and the mobile device's slow UFS flash storage is causing a data input bottleneck.", "The Apple A17 Pro's 35 TOPS NPU is simply not enough compute power for a 3B parameter model that was trained on 989 TFLOPS H100s.", "The model requires PCIe Gen5 to feed the accelerator, and the mobile SoC's internal bus protocol has much higher latency, starving the NPU."], "correct_index": 0}}, {"id": "mobile-0490", "title": "The Drowsy Driver Dilemma: Centralized vs. 
Federated TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which strategy yields a lower Total Cost of Ownership (TCO) for a 100K-vehicle driver drowsiness fleet: centralized collection or Federated Learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper: the $5,000 training cost is high but FL's daily uploads add up to more.", "The costs are roughly equivalent: as FL's high frequency of uploads cancels out its smaller data size.", "Federated Learning is cheaper by over $75,000 per year.", "Centralized is cheaper because the weekly cloud training cost of $5,000 dominates all other factors."], "correct_index": 2}}, {"id": "mobile-0493", "title": "The Mobile ViT Deadline Miss", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why can two 10 GFLOP bokeh models have radically different latency on an A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT's non-local memory access pattern for the attention map results in a high number of cache misses on the SoC, which is the primary bottleneck. The FLOP count is secondary.", "The Apple A17's Neural Engine lacks hardware acceleration for the specific Softmax or LayerNorm ops in the ViT, forcing them onto the CPU.", "The ViT's self-attention has a low Arithmetic Intensity, making it memory-bandwidth bound on the A17's memory system.", "The ViT's larger activation sizes are causing cache eviction and thrashing, but the issue is cache capacity, not bandwidth."], "correct_index": 2}}, {"id": "mobile-0494", "title": "The Stuttering AI Assistant: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the missed deadlines?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The A17 Pro's compute is insufficient for a 3B model, making the operation compute-bound.", "Continuous inference causes thermal throttling, increasing TTFT to > 60 ms.", "The 8ms static batching queue and the ~44ms memory-bound weight load collectively exceed the 16ms budget.", "The phone's CPU dispatch latency adds a 50 ms overhead before the Neural Engine starts."], "correct_index": 2}}, {"id": "mobile-0496", "title": "The Federated Photobomber Fallacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What cost makes the proposed Federated Learning training plan impractical despite privacy benefits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The network bandwidth required to upload model updates from 10 million clients across 1,000 rounds is the primary cost bottleneck.", "The 
cumulative on-device energy consumption will lead to unacceptable battery drain for users.", "The cloud compute cost for the central server to aggregate gradients from millions of clients will be the most expensive part of the system.", "The model will likely have poor final accuracy due to non-IID data from users, making the effort technically infeasible regardless of cost."], "correct_index": 1}}, {"id": "mobile-0499", "title": "The Mobile Transformer Jank Puzzle", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an A17 Pro ViT video filter hit only 15 FPS with 30% ANE utilization but saturated memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too small for the NPU, and the batch size should be increased to improve utilization.", "The NPU driver has a bug causing inefficient scheduling of Transformer operations, and it should be reported to the vendor.", "The ViT's self-attention is memory-bound; its arithmetic intensity is too low for the A17's hardware balance.", "The model must be quantized from FP16 to INT8, as the computational load is clearly too high."], "correct_index": 2}}, {"id": "mobile-0500", "title": "The SoC Shuffle Tax", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the SoC shuffle tax, and how does CPU-NPU ping-ponging an 8 MB tensor hurt latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0503", "title": "The Transformer's Mobile Traffic Jam", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a ViT object detector with a similar parameter count run far slower than MobileNetV2 on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT has a larger total activation memory size, which overflows the NPU's limited on-chip SRAM.", "The Vision Transformer has significantly more total FLOPs (Floating Point Operations) than the MobileNetV2.", "The ViT's self-attention has low Arithmetic Intensity, making it memory-bandwidth bound on the NPU.", "The A17 Neural Engine lacks optimized hardware kernels for the Softmax and LayerNorm operations within the ViT."], "correct_index": 2}}, {"id": "mobile-0504", "title": "The Sluggish Smart Reply: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching make smart reply TTFT exceed 300 ms even though TPOT is 80 ms?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 2}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["The 2B parameter model is too large for the mobile NPU, causing slow inference.", "The static batching policy is introducing excessive queuing delay before inference begins.", "LPDDR5 memory bandwidth is saturated, creating a bottleneck when loading model weights for each batch.", "The CPU is too slow at tokenizing and preparing the input tensors, starving the NPU."], "correct_index": 1}}, {"id": "mobile-0505", "title": "H100 PCIe Sync Bottleneck in a Mobile Assistant Backend", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What backend GPU interconnect bottleneck most likely creates the 85ms sync delay for the mobile assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The InfiniBand network connecting the servers in the rack is saturated.", "The model is too large, causing the compute phases on the H100s to be the bottleneck.", "The two GPUs are communicating over the PCIe bus instead of NVLink, likely due to a server misconfiguration.", "The GPU HBM is full, causing the system to swap activations to slower system DRAM."], "correct_index": 2}}, {"id": "mobile-0506", "title": "The Privacy vs. Price Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which first-year Total Cost of Ownership (TCO) is lower for mobile keyboard autocorrect: centralized training or Federated Learning?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 2}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper ($75k vs $236.5k) due to high FL CapEx.", "FL is cheaper ($36.5k vs $50k) due to lower cloud compute.", "FL is cheaper ($36.5k vs $75k) due to no data storage costs.", "Centralized is cheaper ($50k vs $200k) due to server costs."], "correct_index": 0}}, {"id": "mobile-0510", "title": "The Stuttering On-Device Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling architecture should an on-device voice assistant use to reduce queueing delay during generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive 4-bit quantization to the model to reduce TPOT below the 16ms deadline.", "Redesign the UI to be fully asynchronous, only displaying the full text once the entire sequence is generated in the background.", "Implement continuous batching to process new user input at the token-level alongside existing generation, minimizing queueing delay.", "Conclude the NPU TOPS are insufficient and offload inference to the cloud."], "correct_index": 2}}, {"id": "mobile-0511", "title": "The Distributed Training Scaling Failure", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", 
"question": "What cross-node interconnect bottleneck explains poor scaling when teacher-model training grows from 2 to 8 nodes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PCIe saturation between GPUs and CPUs is the bottleneck, even though profiling shows normal intra-node behavior.", "NVLink bandwidth within each node is the bottleneck, even though the profiler shows normal NVLink traffic.", "Standard Ethernet is forcing cross-node AllReduce through the TCP/IP stack instead of an RDMA-capable fabric like InfiniBand.", "Low arithmetic intensity is the bottleneck, so larger batches are needed despite the profiler showing high AllReduce latency."], "correct_index": 2}}, {"id": "mobile-0512", "title": "Federated Learning for Vehicle Voice Assistant Upload Cost", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What strategy avoids astronomical cellular upload costs while preserving privacy for a 1M-vehicle voice assistant A/B test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0514", "title": "The Saturated AR Filter", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a MobileNetV3 AR filter glitch only in bright outdoor scenes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The post-training quantization (PTQ) calibration dataset was not representative of production data, causing activation values to overflow the INT8 range in bright scenes.", "The Neural Engine has a documented hardware bug when performing INT8 convolutions.", "MobileNetV3's architecture uses swish activations which are numerically unstable in INT8.", "The model was quantized using dynamic quantization, which is too slow."], "correct_index": 0}}, {"id": "mobile-0515", "title": "The Infotainment Jank", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which architecture should replace a janky ViT on an infotainment system, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is memory-bandwidth-bound due to attention's low arithmetic intensity. Choose the MobileNetV3, as its convolutional structure provides better data locality.", "The ViT is compute-bound because 22M parameters is too large. Choose the ViT-MoE.", "The problem is model size. 
Choose the ViT-MoE as it has fewer active parameters.", "The bottleneck is likely an inefficient operator in the ViT model that isn't supported."], "correct_index": 0}}, {"id": "mobile-0516", "title": "The Laggy AI Assistant", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 100ms static batching timeout make mobile AI suggestions feel laggy during fast typing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 150ms TTFT is too slow for a real-time mobile application and must be optimized to be less than the 100ms request arrival interval.", "The max batch size of 4 is too large, causing high latency. Reducing it to 1 would solve the problem.", "The 100ms static batching timeout can add avoidable wait before the 150ms TTFT, so suggestions can arrive around 250ms after a keystroke and become stale during fast typing.", "The latency is caused by the 100ms batching interval compounding with 150ms TTFT, leading to exactly 1.5 seconds of queue wait time after 6 keystrokes."], "correct_index": 2}}, {"id": "mobile-0517", "title": "The Federated Economics Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 10M smart-reply DAU, how does centralized logging TCO compare with an $800K federated-learning buildout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Option A (Centralized) is better because the $800,000 upfront engineering cost for FL makes it too expensive.", "Option A (Centralized) has a much lower infrastructure TCO (~$4.2k vs ~$327k/year), but Option B (Federated) may be the true cheaper option when regulatory and breach risks are factored in.", "Option A (Centralized) is better; the daily costs are a standard operational expense and the privacy risk can be mitigated with user agreements.", "Option B (Federated) is cheaper in pure infrastructure costs, even when amortizing the engineering cost."], "correct_index": 1}}, {"id": "mobile-0520", "title": "The Overexposed Image Crash", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 calibration on lab images make a smart-exposure model fail outdoors in bright sun?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's weights have a dynamic range too large for INT8, causing weight overflow during conversion.", "The NPU has a known hardware bug when handling certain INT8 convolution operations.", "The calibration dataset was not representative of outdoor scenes, leading to activation values overflowing the INT8 range.", "The increased temperature of the phone from being in the sun caused thermal throttling."], "correct_index": 2}}, {"id": "mobile-0522", "title": "The Sluggish Co-Pilot", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", 
"status": "published", "phase": "inference", "question": "What inference phase explains an 800ms TTFT with 50ms TPOT in an on-device 3B LLM assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too large, making the NPU completely memory-bound for both phases.", "The model is being loaded from slow flash storage on every request, causing high cold-start latency.", "The initial prompt processing (pre-fill) is highly compute-bound due to the large GEMM operation required for the entire prompt sequence.", "The CPU is taking too long to tokenize the user's text input before sending it to the Neural Engine."], "correct_index": 2}}, {"id": "mobile-0523", "title": "The Multi-Node Scaling Cliff", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does scaling from 8 to 32 H100s improve throughput only 20% when AllReduce consumes 70% of each step?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated from transferring gradients between the GPU and the network card for inter-node communication.", "The model's activations are causing HBM cache misses, and the system is stalling on main memory access.", "Communication is bottlenecked by the ~18x bandwidth drop when moving from the 900 GB/s intra-node NVLink to the ~50 GB/s inter-node InfiniBand fabric.", "The CPU is unable to schedule the `AllReduce` kernels fast enough across the 32 GPUs, creating a dispatch bottleneck."], "correct_index": 2}}, {"id": "mobile-0524", "title": "The Real-Time Filter Lag", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the custom real-time filter layer memory-bound or compute-bound, and what arithmetic intensity evidence supports that?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound because 7 GOps takes 200ms on a 35 TOPS chip.", "The layer is memory-bound because its Arithmetic Intensity (140 Ops/Byte) is less than the A17 Pro's ridge point (~683 Ops/Byte).", "The layer is compute-bound because its Arithmetic Intensity is 0.14 Ops/Byte (7/50), below the ridge point.", "The layer is memory-bound because 50 MB takes 50ms at 1 GB/s."], "correct_index": 1}}, {"id": "mobile-0526", "title": "The Keyboard Jank Crisis", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What deployment efficiency path best fixes a 50M Transformer keyboard model that misses latency and memory budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is memory-bandwidth bound. Switch to a MobileNet-style CNN using depthwise separable convolutions to create larger, more parallelizable operations.", "The model is too large. 
Shrink the Transformer to 25M parameters to cut the memory and compute costs by 50%, which will meet the latency target.", "The NPU is being starved. Use a Neural Architecture Search (NAS) to automatically find a more efficient Transformer block.", "The model is memory-bandwidth bound. Apply 4-bit quantization to the existing model to reduce weight traffic and meet the latency and memory targets."], "correct_index": 3}}, {"id": "mobile-0527", "title": "The On-Device Copilot's Janky Keyboard", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching four 20-token keyboard predictions on A17 Pro cause 16ms UI deadline misses?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU is too slow. With a 25% utilization rate, it is clear the hardware cannot keep up with the 10 request/sec arrival rate, causing the request queue to grow indefinitely and block the UI thread.", "The memory bandwidth is the bottleneck. The model is too large, and the NPU is spending most of its time waiting for data from DRAM instead of computing, which is why the processing time is so long.", "The static batching creates head-of-line blocking. The 100ms uninterruptible batch processing time exceeds the 16ms UI deadline, causing frame drops. A switch to continuous batching is needed.", "The batch size is too large. Reducing the batch size to 1 is the only way to minimize latency and ensure the 16ms deadline is never missed, even if it means lower overall throughput."], "correct_index": 2}}, {"id": "mobile-0528", "title": "The Privacy vs. 
Profit A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What net daily value do centralized 95% and federated 80% smart-reply models create for 1M users at 10 suggestions/day?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cloud model is superior by $17,500 per day, as it generates far more value from accepted suggestions.", "The Federated Learning model is the only responsible choice, as the potential cost of a data breach outweighs any daily revenue metric.", "The Cloud model is only marginally better, with a net value gain of $14,000 per day.", "The Federated Learning approach has a daily opportunity cost of $9,000, forcing a decision between immediate user value and long-term privacy strategy."], "correct_index": 3}}, {"id": "mobile-0529", "title": "The Cross-Country Trip on a City Bus", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the cloud LLM server showing 500ms P99 latency despite fast network RTT and high GPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The server is compute-bound because GPU utilization is at 90%, and a more powerful GPU like the B200 is needed.", "The HBM3 memory latency is too high (~300 ns), creating a memory wall that stalls the GPU cores.", "The system is communication-bound due to using the low-bandwidth PCIe bus for tensor parallel exchanges instead of the high-bandwidth NVLink fabric.", "The data center's InfiniBand network is saturated, as the 8 GPUs are likely in different racks, causing high latency between them."], "correct_index": 2}}, {"id": "mobile-0532", "title": "The Mobile-ViT Latency Trap", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why is the Micro-ViT disproportionately slower than MobileNetV3 on A17 Pro, and what architecture best balances accuracy and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT is compute-bound because 1.5 GFLOPs / 30ms = 50 GFLOPS, exceeding the hardware's sustained rate.", "The ViT model is too large for the cache. We must apply more aggressive INT4 quantization and 50% unstructured pruning to reduce the memory footprint.", "The ViT is memory-bandwidth bound due to the low arithmetic intensity of self-attention. Propose a hybrid CNN-Transformer architecture, potentially found via NAS, that uses efficient depthwise separable convolutions for early stages and attention for later stages.", "Increase model capacity by using a Mixture of Experts (MoE) layer, which keeps inference FLOPs constant by only activating one expert. 
This will improve accuracy without increasing latency."], "correct_index": 2}}, {"id": "mobile-0533", "title": "The Janky Visual Search", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given that the mobile network latency is consistently around 50ms and the model's compute-per-expert is stable, what is the most likely cause of this high latency variance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The backend's InfiniBand network, used for RDMA, is dropping packets, causing high-latency fallbacks.", "The model's KV-cache is spilling from HBM to much slower system RAM, but only for certain inputs that activate the largest experts.", "The server lacks full NVLink connectivity, and the MoE router is picking experts on GPUs across different CPU sockets, forcing slow communication over the PCIe bus and inter-socket links.", "The mobile client's cellular connection has high packet loss, and TCP retransmission delays are causing the latency spikes."], "correct_index": 2}}, {"id": "mobile-0534", "title": "The Mobile Style Transfer Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 200 GOps style transfer layer compute-bound or memory-bound, and what roofline evidence supports that diagnosis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound. The 35 TOPS of the NPU is insufficient to execute 200 Giga-ops within the 16ms time budget.", "Memory-bound. The layer's Arithmetic Intensity (400 Ops/Byte) is below the hardware ridge point (~683 Ops/Byte).", "Thermally-bound. The SoC is likely overheating and throttling the NPU clock speed, a common issue in slim devices under sustained load.", "Memory-bound. 
The 500 MB of data is too large to fit in the on-chip caches, forcing slow reads from main RAM."], "correct_index": 1}}, {"id": "mobile-0536", "title": "The Laggy Mobile Video Filter", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What backend topology problem most likely explains the 150ms P99 latency for the cloud-based mobile video filter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile client's 5G connection has high jitter, causing unpredictable packet delays.", "The data transfer from the server's network card to the GPU over the PCIe bus is the primary bottleneck.", "The backend orchestrator is placing the two model GPUs on different servers, forcing traffic over the slower InfiniBand network instead of NVLink.", "The H100 GPU is running too hot under load and is being thermally throttled, increasing inference time."], "correct_index": 2}}, {"id": "mobile-0537", "title": "The Mobile Video Battery Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 3x3 depthwise convolution on a 112x112x256 INT8 tensor drain battery on A17 Pro?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The layer is compute-bound; the 3x3 kernel requires too many FLOPs for the NPU to sustain 60 FPS.", "The layer is memory-bound; its arithmetic intensity (9 Ops/Byte) is far below the A17's ridge point, stalling the NPU.", "Thermal throttling is artificially reducing the ridge point to 10 Ops/Byte, matching the layer's intensity.", "The INT8 data type induces a 4x penalty on memory bus transfers compared to native FP16 execution."], "correct_index": 1}}, {"id": "mobile-0538", "title": "The Headlight False Positive", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the root cause of the catastrophic INT8 accuracy drop during night driving with bright headlights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SoC's NPU has a hardware bug when processing high-frequency image data at night.", "The CNN architecture has overfit to daytime images and is not robust to the domain shift of night driving.", "The INT8 calibration range is too narrow, causing activation value saturation when encountering high-contrast night scenes.", "The camera's auto-exposure control is failing at night, delivering incorrectly normalized input frames to the model."], "correct_index": 2}}, {"id": "mobile-0539", "title": "The On-Device Search Dilemma", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the ViT 
miss the 16ms mobile jank budget compared with the NAS-proposed depthwise CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT's non-local memory access pattern for the attention map results in a high number of cache misses on the SoC, which is the primary bottleneck. The FLOP count is secondary.", "The Apple A17's Neural Engine lacks hardware acceleration for the specific Softmax or LayerNorm ops in the ViT, forcing CPU fallback.", "The ViT's computational cost scales quadratically with input tokens, making it ~7.4x more expensive (~67 MFLOPs vs ~9 MFLOPs). This FLOP gap is the root cause.", "The model is bottlenecked by memory bandwidth, not compute. The fix is to quantize from FP16 to INT8, halving data movement and solving latency without changing architecture."], "correct_index": 0}}, {"id": "mobile-0540", "title": "The Live Translation Freeze", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What causes the live translation feature's excessive TTFT when static batching waits for four pending requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's single-token inference time of 40ms is too slow for the 80ms budget and requires further optimization.", "The Snapdragon 8 Gen 3 NPU is saturated by a batch of 4, causing thermal throttling and increased latency.", "The static batching policy creates excessive queueing delay, as early requests are starved waiting for the batch to fill.", "The device's LPDDR5x memory bandwidth is insufficient for batching, causing stalls when moving four requests' data to the NPU."], "correct_index": 2}}, {"id": "mobile-0541", "title": "The Federated Learning TCO Trap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What drives the much higher network TCO of the federated keyboard personalization variant, and how should the trade-off be framed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cost is high because 5% of 5 million users is too large; the participation rate should be lowered to 0.1% to make it cost-effective.", "The FL approach transfers 50,000x more data per day (5 TB vs. 100 MB). This higher TCO is a direct trade-off for enhanced user privacy and reduced data breach liability, as no raw text ever leaves the device.", "The real cost isn't the network, but the on-device compute draining user batteries. The network cost is a secondary concern that can be ignored.", "The centralized approach is cheaper and therefore better. 
Properly anonymized text is 'good enough' for privacy, and the massive cost savings are the most important business factor."], "correct_index": 1}}, {"id": "mobile-0542", "title": "The Mobile App's Cloud Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the AI Retouch backend spend 250ms moving only 50MB over PCIe Gen5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Gen5 bus is saturated and cannot handle the 50MB transfer load efficiently.", "Model execution (400ms) is the primary bottleneck; the transfer time is secondary and should be ignored.", "The data is being moved in thousands of small chunks, making the transfer latency-bound.", "The server's InfiniBand network is causing contention on the PCIe bus, slowing down the transfer."], "correct_index": 2}}, {"id": "mobile-0543", "title": "The Real-Time Filter Jank", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using roofline analysis, what bottleneck explains the real-time video filter's 24ms frame latency on an A17-class device?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 2}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is compute-bound. An arithmetic intensity of 58.3 Ops/Byte is very high, indicating a workload dominated by calculations.", "The model is compute-bound. The A17 Pro's 35 TOPS is not sufficient to process 70 G-Ops within the 16ms frame budget.", "The model is memory-bound. Its arithmetic intensity of 58.3 Ops/Byte is significantly lower than the A17 Pro's ridge point of ~700 Ops/Byte.", "The bottleneck is thermal throttling. 
The workload is exceeding the SoC's power budget, forcing it to slow down, regardless of the model's specifics."], "correct_index": 2}}, {"id": "mobile-0544", "title": "The Night-Vision Blind Spot", "topic": "real-time-deadlines", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ make a pedestrian detector miss night scenes when daytime calibration has only 100 images?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MobileNet architecture's Hard-Swish activations are unstable for quantization and are causing numerical errors.", "The phone's NPU has a hardware bug in its INT8 arithmetic unit that incorrectly handles high-value multiplications.", "The calibration dataset lacked representative nighttime images, leading to incorrect quantization parameters that saturate activations for dark inputs.", "The model is overfitting to the training data and needs to be completely retrained with a lower learning rate and more regularization."], "correct_index": 2}}, {"id": "mobile-0545", "title": "The MobileNet Migration Diagnosis", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which architectural change should replace the slow ResNet-style layer to meet the mobile visual sticker budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0546", "title": "The AI Keyboard Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching cause the AI keyboard to freeze despite acceptable single-request TTFT and TPOT?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The latency equals (80ms + 50ms) * 4 = 520ms, proving the NPU is saturated and batch size must decrease.", "The 80ms TTFT is inherently too slow for real-time UI, causing the jank.", "Static batching forces short requests to wait ~530ms for the longest sequence to finish via head-of-line blocking.", "The average latency of 330ms indicates the shared LPDDR5 memory bus is saturated."], "correct_index": 2}}, {"id": "mobile-0547", "title": "The Topologically Flawed Upgrade", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is replacing InfiniBand with NVLink a flawed fix for multi-node AllReduce bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["InfiniBand NDR (400 Gbps) provides higher sustained bandwidth across nodes than NVLink 4.0 (900 GB/s) due to overhead.", "It is better to connect the nodes directly using PCIe Gen5 for lower latency.", "NVLink is an intra-node interconnect for GPU-to-GPU communication within a server, while InfiniBand is for inter-node communication.", "The bottleneck is a software issue in the 
AllReduce algorithm's implementation, not a hardware limitation."], "correct_index": 2}}, {"id": "mobile-0550", "title": "The Sluggish Voice Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the in-car voice assistant have slow TTFT even though subsequent tokens generate quickly?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 2}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The prompt prefill computation is too large, exceeding the NPU's per-operation compute budget and causing a hardware fault.", "The system is running out of on-chip memory when both tasks are loaded, forcing slow data swaps from main DRAM during prefill.", "The FIFO queue is causing head-of-line blocking; the high-priority voice query is stuck waiting for the entire low-priority background task to complete.", "The fast TPOT generation for the previous task is saturating the memory bus, preventing the voice prompt from being loaded onto the NPU."], "correct_index": 2}}, {"id": "mobile-0551", "title": "The Mobile App's Server-Side Stall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What server-side data ingress bottleneck explains the AI Photo Editor's missing 105ms of latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The mobile device's 5G uplink has 105ms of high packet loss, forcing numerous retries.", "Data transfer over the server's PCIe bus from CPU RAM to GPU HBM adds ~105ms due to software-mediated copies.", "The H100 server uses InfiniBand to connect to storage, adding 105ms of latency.", "The server's NVLink interconnects are saturated, adding 105ms to the data copy."], "correct_index": 1}}, {"id": "mobile-0554", "title": "The Mobile NAS Showdown: CNN vs. 
Sparse Transformer", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate should be chosen for the NPU video filter, and why does hardware efficiency outweigh raw operation count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Candidate B, because its total active operations (47.5 GOps) are significantly lower than the original 100 GOps ViT.", "Candidate B, because Transformers are architecturally superior to CNNs and MoE makes them efficient enough for mobile deployment.", "Candidate A, because its regular, dense structure achieves much higher architectural efficiency (eta_arch) on the mobile NPU, resulting in a latency of ~1.1ms vs ~4.5ms for the MoE model.", "Candidate A, because its total GOps (30) are lower than the MoE ViT's (47.5), and lower GOps always means lower latency."], "correct_index": 2}}, {"id": "mobile-0555", "title": "The Stuttering Translator", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batch size 4 cause a 500 ms first-token delay for a 1B translator on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 1B model size causes a 500ms memory bottleneck because 1GB / 2 GB/s effective bandwidth = 500ms.", "The 35 TOPS NPU takes 500ms to process the 4-token prefill batch.", "The static batch size of 4 forces a ~500ms queuing delay while waiting for user input. 
Switch to continuous batching.", "The 51.2 GB/s bandwidth limits prefill of 4 batched tokens to ~500ms."], "correct_index": 2}}, {"id": "mobile-0556", "title": "The Lane Centering TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which retraining strategy is economically viable for the fleet, and what data-transfer cost drives the decision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized Training, because the annual cost is around $66,000, which is a reasonable R&D expense.", "Federated Learning, because the Centralized approach would cost over $1.2 million annually in data fees.", "Centralized Training, as the powerful cloud GPUs result in a better model, and data costs are negligible.", "Federated Learning, to avoid user complaints about battery drain."], "correct_index": 1}}, {"id": "mobile-0557", "title": "The Laggy Generative Uncrop", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What inter-GPU communication bottleneck explains the 1.5s latency of the Generative Uncrop backend?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0559", "title": "The Saturated Stop Sign", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this catastrophic accuracy drop for a single class, and how would you solve it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Snapdragon NPU has a known bug with certain convolutional operators when handling saturated INT8 values. You should force this specific layer to run on the CPU or GPU instead of the NPU.", "The model requires higher precision for this feature. You should use mixed precision, keeping the final convolutional and classification layers in FP16 while quantizing the rest to INT8.", "The weights in the layer have a high dynamic range, not the activations. You should apply per-channel quantization to the weights of the final convolutional layer instead of per-tensor.", "The high activation values from stop signs are creating a large dynamic range, crushing the quantization resolution for all other classes. 
You should use a percentile-based clipping calibration method to ignore these rare outliers."], "correct_index": 3}}, {"id": "mobile-0560", "title": "The Mobile MoE Trade-off", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which option demonstrates the most sophisticated understanding of on-device constraints and is the most promising to investigate further?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Micro-ViT, because Transformers are the most powerful modern architecture and worth the integration cost.", "The EfficientNet-B0, because it's a standard architecture that provides a performance boost and comfortably fits the latency budget.", "The MoE Hybrid, because it uniquely decouples model capacity (total parameters) from inference cost (active FLOPs), fitting the task and budget.", "Both the EfficientNet-B0 and the MoE Hybrid are equally valid choices since both are well under the 25ms latency budget."], "correct_index": 2}}, {"id": "mobile-0561", "title": "The Stuttering Voice Assistant", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does static batching make the on-device voice assistant emit words in bursts instead of a smooth stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The NPU's 10ms-per-token performance is too slow; the model must be quantized from FP16 to INT8 to increase raw token throughput.", "The LPDDR5 memory bandwidth is saturated by loading activations for the whole batch. The model's hidden dimension size must be reduced.", "The static batching forces each user to wait for the entire batch's tokens to be generated for every word, causing high TPOT. Switch to continuous batching to decouple requests and stream tokens back smoothly.", "The 50ms batching timeout is too high, causing unacceptable TTFT. 
The timeout should be reduced to 10ms to make the system more responsive."], "correct_index": 2}}, {"id": "mobile-0563", "title": "The Predictive Charging TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which predictive charging training approach is cheaper, and how does participation rate change the TCO calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized is cheaper by ~$131,400 because FL uploads 5MB per user instead of 1MB.", "Federated Learning is more expensive by ~$147,825 assuming 100% participation for 5MB uploads.", "Federated Learning is cheaper by ~$16,425 because the 10% participation rate offsets the 5x larger payload.", "Both approaches cost exactly $32,850 because bandwidth compresses model updates."], "correct_index": 2}}, {"id": "mobile-0564", "title": "The Keyboard's Privacy-Cost Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which smart keyboard personalization approach should the A/B test use, considering privacy, battery, and data TCO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose Centralized (A). The battery drain from on-device training is unacceptable and will cause user churn.", "Choose Centralized (A). The annual data upload cost of $24,000 is high, but still cheaper than the data costs from federated learning updates.", "Choose Federated (B). 
It has a lower annual operating TCO (~$3.6K vs ~$24K) and critically, avoids uploading sensitive user keystrokes, making it the only viable option from a privacy standpoint.", "Choose Federated (B), because its annual data TCO is only $3,600, which is an order of magnitude cheaper than the Centralized approach's multi-million dollar data bill."], "correct_index": 2}}, {"id": "mobile-0565", "title": "The Smart Keyboard TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which smart keyboard training strategy has the lower full TCO after accounting for network and user battery costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0566", "title": "The Mobile 'Pro' Upgrade Dilemma", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should effective throughput be calculated to determine which upgrade option satisfies the 20ms latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MoE model adds 50% more parameters, so its latency will increase by 50% to 27ms, making it unviable.", "Scaling the dense model to 6 G-MACs will double the latency to 36ms, but the MoE model will have a latency of ~19.8ms, making it the only viable option.", "Since the Snapdragon NPU has 45 TOPS, both models are easily within budget. We should choose the simple scaled model.", "Both approaches increase MACs, so both will fail the latency budget. It's impossible to improve accuracy without increasing latency."], "correct_index": 1}}, {"id": "mobile-0567", "title": "The 'Smart Reply' Battery Drain Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Smart Reply proposal should launch after comparing the one-year Total Cost of Ownership (TCO) and the federated option's battery drain impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Choose Federated Learning. Its 1-year TCO of $710k is higher, but the privacy guarantees are worth the user battery drain of ~1.5%.", "Choose Centralized. Its 1-year TCO is $210k lower, and the Federated Learning option's 2.22% daily battery drain exceeds the 2% user-acceptance threshold.", "Choose Federated Learning. Its server-side operational costs are 6x lower, making it cheaper in the long run, and the battery impact is 0.22%.", "Choose Centralized. 
Federated Learning's 1-year TCO is $630k, which is more expensive than $500k."], "correct_index": 1}}, {"id": "mobile-0568", "title": "Centralized vs Federated Cabin Pre-Warming TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For the cabin pre-warming feature, once infrastructure and engineering operations are included, which is cheaper over one year: centralized training or federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Federated Learning is cheaper because the weekly model updates result in less data transfer cost than daily user logs.", "Centralized is cheaper because training one large model is more compute-efficient than aggregating millions of small ones.", "Centralized is ~7x cheaper; the primary driver is the steep engineering and operational overhead of the Federated Learning system.", "Federated Learning must be chosen because its privacy benefits are paramount and the company should bear any cost."], "correct_index": 2}}, {"id": "mobile-0569", "title": "The Federated Fleet TCO Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Driver Alertness architecture has lower first-year TCO, centralized cloud inference or decentralized federated learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Centralized, because the $1.25M engineering cost for FL is too high to overcome.", "B) They are roughly equivalent in cost, so the decision should be based on model accuracy alone.", "C) Decentralized (FL), as its high initial engineering cost is less than the massive, recurring data transfer and compute costs.", "D) Centralized, because data ingress costs are negligible and modern GPUs are highly efficient."], "correct_index": 2}}, {"id": "mobile-0570", "title": "Differentially Private Drowsiness Detection TCO", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For 1M driving-app DAU, is cloud inference or DP on-device FL cheaper after accounting for infrastructure and battery churn?", "chain_ids": ["mobile-chain-auto-secondary-008-03"], "chain_positions": {"mobile-chain-auto-secondary-008-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-008-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["DP federated learning is too expensive because privacy noise always requires far more training rounds than cloud inference.", "Cloud inference is cheaper because each face crop is only 50 KB, so upload and GPU costs are negligible.", "DP on-device FL costs about $156K/year with the stated churn allowance, while cloud inference is about $311K/year.", "The two options are roughly equivalent because the 0.055% daily battery use necessarily causes at least 0.5% churn."], "correct_index": 2}}, {"id": "mobile-0571", "title": "The Route Prediction TCO Dilemma", "topic": "federated-learning", "competency_area":
"cross-cutting", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which Smart Routes proposal is more cost-effective over time once cloud costs, FL infrastructure, and battery churn are included?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Proposal A is cheaper. The battery drain cost for Proposal B is over $700k because the energy calculation was not converted from minutes to hours.", "Proposal B is significantly cheaper (by over $60k), as the only cost is the user battery impact; the server cost is negligible.", "Proposal B is cheaper in the long run (saving ~$19,100/year in recurring costs), despite higher Year 1 CapEx. The cloud training cost for A is significantly higher than the combined FL server and fleet-wide battery 'churn' cost for B.", "Proposal A is cheaper by ~$58,000. The Federated Learning 'churn cost' alone is higher than the entire cloud budget."], "correct_index": 2}}, {"id": "mobile-0572", "title": "The GPU Context Switch Overhead", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where are the missing 8ms going?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0573", "title": "The On-Device LLM Feasibility Check", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Walk through the feasibility analysis — memory, compute, latency, and battery — to assess whether a 3B on-device chatbot can meet phone constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0574", "title": "The NPU Delegation Failure Modes", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is 87% NPU delegation potentially worse for latency than 0% delegation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0575", "title": "The ANE vs GPU Power Efficiency", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare NPU and GPU TOPS per watt, and when might the GPU still be preferable despite lower efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0576", "title": "CoreML CPU Fallback vs Metal Compute Shaders", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "When is rewriting a custom layer in Metal compute shaders faster than CoreML, and when does it backfire?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0577", "title": "Whisper RTF vs Streaming Latency on Mobile", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do RTF and chunk buffering determine whether Whisper-tiny can work for low-latency streaming ASR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0578", "title": "The NNAPI Fragmentation Problem", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same TFLite INT8 model exhibit a massive 3x latency regression (22ms vs 6ms) on a specific flagship Exynos NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0579", "title": "The Transformer vs CNN on Mobile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is ViT-Small disproportionately slower than MobileNetV3 on Snapdragon 8 Gen 3 despite having only 21x more FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0580", "title": "The ANE Delegation Regression", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Core ML detector regress from 3 ms on A17 Pro ANE to 30 ms CPU inference after an iOS update?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0581", "title": "The Pocket Oven LLM", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the thermal budget and design a system that prevents overheating?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0582", "title": "The Noisy Environment Speech Failure", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you fix speech recognition noise robustness within a 50 MB mobile model budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0583", "title": 
"The Real-Time Video ML Frame Drop", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What's consuming the other 21ms per frame and causing the drop to 15 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0585", "title": "The Data Starvation NPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 10 TOPS Android NPU deliver only 2 TOPS on the detector, and how would you confirm memory bandwidth starvation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0587", "title": "The 15 FPS Video ML Bottleneck", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can't three 30 FPS models run at 30 FPS together?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0588", "title": "Snapdragon SLC Eviction and NPU Latency Spikes", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Snapdragon 8 Elite NPU latencies spike from 5ms to 18ms while the CPU decodes 4K H.265 video, considering the shared 6 MB system-level cache (SLC)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0589", "title": "The MediaTek Dimensity APU Architecture", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a QNN-optimized speech model run 13x slower after switching from Hexagon to MediaTek NeuroPilot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0590", "title": "TFLite Deployment Across Fragmented Exynos NPUs", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you deploy one feature across three Exynos generations with different NPU operator support?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0591", "title": "The CoreML vs TFLite Performance Gap", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What explains Core ML's 1.8 ms MobileNetV3 latency on iPhone versus TFLite's 5.2 ms on Android?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-0592", "title": "The ISP Format Conversion Bottleneck", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 5ms NPU model cap at 12 FPS when 4K frames are resized through OpenCV on the CPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0593", "title": "The Double-Precision Mobile Tax", "topic": "extreme-quantization", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does using 0.5 instead of 0.5f slow ARM NEON bounding-box postprocessing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0594", "title": "The WebView WebGL Throttle", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What WebView architecture bottleneck causes WebGL inference to run slowly despite enough GPU math throughput?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0595", "title": "CPU-GPU Pipeline Serialization with glFinish", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline rule did you break?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 3}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0596", "title": "The Native Bridge Array Copy", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does passing an image over the React Native bridge add 50ms, and how can it be bypassed for video?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0597", "title": "The Fusion Illusion", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the interaction between the CoreML runtime, the A17 Pro's hardware, and the pruned model to explain this disappointing result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0598", "title": "The Pruning Paradox", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can structured attention-head pruning run faster than more aggressive unstructured pruning on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0599", "title": "The Delegate Dilemma", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do 20 NPU-CPU graph partitions cause a 5x slowdown for a single custom operator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0600", "title": "The Sparse Illusion", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the trade-offs and justify which approach is superior for a mobile deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0601", "title": "The Generative Keyboard's Hidden Tax", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much latency can fusing LayerNorm, GeLU, and Add save for a generative keyboard?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0602", "title": "The Speculative Speedup", "topic": "speculative-decoding", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can speculative decoding speed up a mobile 7B LLM despite adding a second draft model?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0604", "title": "The Cloud-Native Fallacy on Mobile", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the mobile SoC shared-memory transfer of a 512MB latent tensor still a serious power and latency problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0605", "title": "The Accelerator Selection Conundrum", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Given the heterogeneous nature of modern mobile SoCs (CPU, GPU, NPU), which accelerator would you primarily target for each model, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0607", "title": "The Budget Phone Mystery", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", 
"question": "Why is there a 10x performance gap between a flagship and a budget phone when both advertise NPU acceleration?", "chain_ids": ["mobile-chain-bucket-roofline-03"], "chain_positions": {"mobile-chain-bucket-roofline-03": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0608", "title": "The Streaming ASR Trade-off", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might the smaller RNN-T model be the right choice for live mobile captions despite lower benchmark accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0609", "title": "The Shared GPU Contention", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do a 12ms ML GPU kernel and 4ms UI render still drop frames, and what scheduling strategy prevents jank?", "chain_ids": ["mobile-chain-auto-012-04"], "chain_positions": {"mobile-chain-auto-012-04": 4}, "chain_tiers": {"mobile-chain-auto-012-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0610", "title": "The NPU Utilization Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 3B INT4 LLM on A18 Pro reach 100% Neural Engine utilization but only 15 tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0611", "title": "The Inference Timing Jitter", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What would you identify as the sources of timing variance and design a system that guarantees sub-10ms P99?", "chain_ids": ["mobile-chain-auto-019-05"], "chain_positions": {"mobile-chain-auto-019-05": 1}, "chain_tiers": {"mobile-chain-auto-019-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0612", "title": "Heterogeneous CPU NPU GPU Scheduling for an AR Pipeline Under 2W", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you approach scheduling and resource allocation across these different compute units?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0614", "title": "The CoreML ANE Fallback", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How can using the dedicated AI accelerator make the model slower?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0615", "title": "The Hardware-Aware NAS for Mobile", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should NAS target both sub-5ms Apple latency and sub-8ms Qualcomm latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0616", "title": "The 60 FPS Camera ML Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you deliver a smooth 60 FPS experience when the model takes longer than the frame time?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": {"mobile-chain-auto-012-01": 4}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0617", "title": "The Depthwise Memory Bound", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why didn't the 8x reduction in math translate to an 8x reduction in time?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0618", "title": "The Google Tensor G4 TPU Trade-off", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the team choose Pixel 9 Pro Tensor G4 or Snapdragon 8 Gen 3 based only on 27 versus 45 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0620", "title": "The JNI Boundary Crossing", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is JNI doing that consumes 13ms of overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0621", "title": "The CoreML Neural Engine Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did CoreML silently reject the Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0623", "title": "The Metal Shader Threadgroup Limit", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a Metal shader with 1024-thread groups run on newer hardware but crash on older devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0624", "title": "The Quantization Slowdown Paradox", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the potential causes of this system-level slowdown, despite the core operation getting faster?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0625", "title": "The Performance Cliff", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can adding unsupported Dynamic Kernel blocks collapse ANE performance despite only a 5% MAC increase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0626", "title": "The Thermal Throttling Paradox", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does forcing maximum NPU performance result in worse sustained average latency than OS-managed clocks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0627", "title": "The Operator Fusion Fallacy", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is aggressive quantization from INT8 to INT4 likely to fail to improve latency here, and what is the true underlying system bottleneck?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0628", "title": "The Speculative Decoding Memory Trap", "topic": "speculative-decoding", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does this time-saving algorithm lead to catastrophic memory failure on a mobile device, and what specific hardware constraint is being violated?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0629", "title": "The Pruning vs. 
Distillation Dilemma", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is 50% unstructured pruning a poor path to the 16ms mobile deadline, and what compression technique should replace it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0630", "title": "The Speculative Pruning Paradox", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does pruning the draft model before combining it with speculative decoding make the mobile LLM worse than speculative decoding alone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0631", "title": "The Night-Blind Driver Monitor", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate this suggestion and predict the true, underlying reason for this catastrophic, light-dependent failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0632", "title": "The Multi-Node Latency Catastrophe", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did latency increase non-linearly instead of scaling gracefully, and what specific hardware interaction is the likely cause?", "chain_ids": ["mobile-chain-auto-secondary-017-08"], "chain_positions": {"mobile-chain-auto-secondary-017-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0633", "title": "The Voice Assistant That Froze The Speedometer", "topic": "real-time-deadlines", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Assess the situation: Why would an LLM scheduling optimization, designed to improve throughput, cause a catastrophic failure in the hard-real-time instrument cluster?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 3}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0634", "title": "The Real-Time Batching Paradox", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a throughput-oriented batching window violate the real-time translation latency budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-0635", "title": "Topology-Oblivious MoE Routing on a Mobile SoC", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does topology-oblivious MoE routing degrade on-device expert communication speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0636", "title": "The Heterogeneous Orchestrator", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you outline a system-level strategy for orchestrating these heterogeneous compute units, addressing potential bottlenecks and power concerns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0637", "title": "Snapdragon Heterogeneous Operator Placement", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Conv2D, custom attention, GELU, and dynamic control flow run across Hexagon NPU, Adreno GPU, and Kryo CPU to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0638", "title": "The Big.LITTLE Synchronization Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does limiting execution to 4 threads outperform 8 threads for inference on a big.LITTLE CPU architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0639", "title": "Federated Learning Stragglers from Device Heterogeneity", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does device heterogeneity break federated learning, and how do you fix the round completion rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0640", "title": "Architecting a Multi-Model On-Device AI Assistant", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system orchestrate and schedule the wake word, ASR, 3B LLM, TTS, and vision models given the memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0641", "title": "Building a Hardware-Adaptive Inference Engine", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": 
"How would you design a cross-platform inference engine that automatically adapts to each SoC's strengths without maintaining 5 separate model variants?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0642", "title": "Architecting an On-Device 3B LLM on an 8GB Phone", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect memory, inference, and UX to run a 3B parameter LLM at ≥20 tokens/sec on an 8GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0643", "title": "The Multi-Modal Sensor Fusion System", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect the fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0645", "title": "The On-Device LLM Keyboard Power Drain", "topic": "speculative-decoding", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should speculative decoding be used to meet the AI keyboard's latency and sub-1W power requirements?", "chain_ids": ["mobile-chain-auto-secondary-017-32"], "chain_positions": {"mobile-chain-auto-secondary-017-32": 2}, "chain_tiers": {"mobile-chain-auto-secondary-017-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0646", "title": "The 'Live Scribe' Concurrency Crisis", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What scheduling architecture is needed for Live Scribe to manage NPU contention across streaming and interactive requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0647", "title": "The Automotive Assistant's Priority Inversion", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture avoids priority inversion for an in-car assistant handling both urgent commands and long conversations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0648", "title": "The Sentient Dashboard: Compressing a Foundation Model for Real-Time Driver Monitoring", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "What multi-stage compression strategy can productionize a 10B foundation model to meet strict real-time safety deadlines?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0649", "title": "The Phantom Reboot: Designing a Resilient Automotive ML Watchdog", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should the drowsiness detector recover from NaN freezes without triggering the vehicle control watchdog reboot?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0650", "title": "The SolarSentry Dashcam Meltdown", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What dynamic architecture lets SolarSentry detect parked-car threats within thermal and battery constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0651", "title": "The Near-Miss Privacy Paradox", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid near-miss learning system minimizes cellular transfer cost while preserving privacy and safety validation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stream all sensor data to the cloud via cellular network 24/7 to guarantee the highest fidelity training dataset.", "Run a hybrid system: local models flag potential near-misses but only transmit anonymized, compressed metadata.", "Deploy an untested, unvalidated federated learning model directly to the control systems of 1 million cars.", "Store all near-miss data locally until the user connects their car to an active Wi-Fi connection."], "correct_index": 1}}, {"id": "mobile-0652", "title": "The Sun-Soaked Sentry Problem", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions to meet these conflicting thermal, power, and performance requirements on an automotive SoC?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0653", "title": "The On-Device 7B LLM Mandate", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would compression, KV limits, and fused kernels meet a 2GB on-device budget, and when should a student-verifier replace the raw 7B?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0654", "title": "Deploying a 7B Automotive LLM Within 2GB", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the three main pillars 
of your technical plan to bridge this gap, how do they interact, and what are the expected quantitative gains from each?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 3}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0655", "title": "Preemptive NPU Scheduling for Guardian Copilot", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What preemptible scheduling architecture prevents low-priority Guardian copilot work from blocking safety-critical tasks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0656", "title": "The Sentry Mode Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hierarchical sensing and duty-cycling architecture meets the 72-hour Sentry Mode power, thermal, and responsiveness constraints?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0657", "title": "The Autonomous Dashcam's Thermal Budget", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why is continuous powerful vision inference infeasible for the autonomous dashcam, and what duty-cycled architecture is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0658", "title": "The AutoScribe Jank Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should transcription and LLM summarization be scheduled to avoid UI freezes and meet real-time deadlines?", "chain_ids": ["mobile-chain-auto-012-03"], "chain_positions": {"mobile-chain-auto-012-03": 4}, "chain_tiers": {"mobile-chain-auto-012-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0659", "title": "The In-Car LLM Mandate", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What portfolio of model and systems optimizations makes the 7B in-car AI Co-Pilot feasible without cloud connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0660", "title": "The On-Device Copilot Power Budget", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", 
"status": "published", "phase": "training", "question": "What multi-model speculative decoding system can meet the always-on phone assistant's latency and power targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0661", "title": "The Sentient Dashcam: Designing for Hostile Automotive Environments", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What adaptive control system should manage Sentry Mode power and thermals while meeting a 5-second detection deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0662", "title": "The AR Navigation Preemption Crisis", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you schedule a 400ms VLM inference without starving a 30 FPS navigation task on a non-preemptible NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0663", "title": "Android TFLite Model Loading OOM on a 4 GB Phone", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Your models only use 23 MB — how can that cause OOM on a 4 GB device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0664", "title": "The Mobile LLM KV-Cache Squeeze", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What grows until the OS jetsams a 3B INT4 chat app after 10 or more turns?", "chain_ids": ["mobile-chain-auto-014-20"], "chain_positions": {"mobile-chain-auto-014-20": 0}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0665", "title": "The Unified Memory Architecture Advantage", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural difference lets Apple's unified memory avoid copies that Qualcomm's shared DRAM still requires?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0666", "title": "Why a 1.2 GB Core ML Model Gets Jetsam-Killed on iOS", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 1.2GB CoreML model be killed under iOS memory pressure, and how should weights be loaded to avoid Jetsam?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 2}, 
"chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0668", "title": "The Budget Phone Crash", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 15 MB model crash a 4 GB phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0669", "title": "The \"Small Model, Big Latency\" Puzzle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is a common, often overlooked factor causing this high latency on the CPU, and how would you investigate it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0670", "title": "The Quantization Bandwidth Boon", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is a primary reason for this limited latency improvement on a memory-bound mobile NPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0671", "title": "The OOM Crash on Older iPhones", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the 28 MB model cause out-of-memory crashes on 4 GB RAM devices, and how can it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0673", "title": "The Silent Eviction", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does returning to an iOS app with a 400 MB mmap CoreML model cause a 3-second freeze?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0674", "title": "The Memory Bandwidth Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is Model A consistently slower than Model B on a mobile NPU despite having the same total MAC operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0675", "title": "The Memory Bandwidth Throttling", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1080p segmentation model drop from 30 FPS to 12 FPS when screen recording starts on the same memory bus?", "chain_ids": ["mobile-chain-auto-012-01"], "chain_positions": 
{"mobile-chain-auto-012-01": 3}, "chain_tiers": {"mobile-chain-auto-012-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0676", "title": "The Memory Map (mmap) Page Fault Freeze", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mmap of a 150 MB model freeze the first inference for 1.2 seconds but not later passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0677", "title": "The LPDDR5X Bandwidth Budget", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Is the memory bandwidth sufficient for real-time token generation at 30+ tokens/second?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 1}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0678", "title": "The On-Device Training Storage Bloat", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where did the 4.2 GB of storage use come from during on-device fine-tuning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0679", "title": "The On-Device Vector Search L2 Mismatch", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an HNSW search over unnormalized 384-dim note embeddings return tax documents for an Italy vacation query?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0680", "title": "The Invisible OOM Crash", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 100 MB Android ML model crash with OOM when there is 500 MB of free physical RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0681", "title": "The Phantom OOM Crash", "topic": "extreme-quantization", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can loading a 500 MB model OOM intermittently even when the phone reports gigabytes of free RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0682", "title": "The Memory-Mapped Page Fault", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": 
"mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mmap of a 100 MB TFLite model freeze the UI for 800 ms on the first Android inference?", "chain_ids": ["mobile-chain-auto-014-17"], "chain_positions": {"mobile-chain-auto-014-17": 0}, "chain_tiers": {"mobile-chain-auto-014-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0683", "title": "The Camera Pipeline Memory Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does camera-preview segmentation rise from 8 ms to 14 ms when the ISP uses LPDDR5?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0684", "title": "The Mobile Memory Controller Puzzle", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where is the missing 44% of theoretical LPDDR5X memory bandwidth during generation?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 2}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0686", "title": "The Memory-Mapped Weight Strategy", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can mmap cut a 300 MB model startup from 3s to under 500ms without shrinking the model?", "chain_ids": ["mobile-chain-auto-014-17"], "chain_positions": {"mobile-chain-auto-014-17": 1}, "chain_tiers": {"mobile-chain-auto-014-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0687", "title": "The On-Device RAG Memory Budget", "topic": "compound-ai-systems", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does a Galaxy S24 Ultra on-device RAG stack fit with a 3B INT4 LLM and 500K 768-dim embeddings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0688", "title": "The Multi-Model Memory Sharing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much memory does a shared-backbone architecture save on a 1.5 MB INT8 backbone, and what runtime tradeoff occurs on the Apple ANE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0689", "title": "The On-Device Fine-Tuning Corruption", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": 
"L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can you personalize a 100M image classifier without dropping original accuracy from 85% to 12%?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0690", "title": "The On-Device Image Generation Memory Wall", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the app get jetsammed at denoising step 12 even though 1.84 GB of weights fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0691", "title": "Quantization Strategy for On-Device Updates", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should a mobile segmentation team choose QAT over DRQ, and when is DRQ better for OTA model iteration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0692", "title": "Unpredictable Latency Spikes", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and mitigate these latency spikes, focusing on memory management within the ML inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0693", "title": "The DRAM Bandwidth Contention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and mitigate memory bandwidth contention between concurrent ML models and UI rendering?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 2}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0694", "title": "The JNI Object Pinning Death", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does JNI GetByteArrayElements make Android UI drop to 2 FPS after pinning 3 MB camera frames at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0695", "title": "The SLC Cache Eviction", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does UI scrolling ruin the ML power budget of an always-on audio classifier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-0696", "title": "The On-Device LLM Memory Architecture", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a 3B FP16 LLM run on a phone when weights need 6 GB and the OS leaves 5 GB free?", "chain_ids": ["mobile-chain-auto-014-20"], "chain_positions": {"mobile-chain-auto-014-20": 2}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0697", "title": "The Custom Allocator Architect", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a custom memory allocator optimized for the deterministic nature of ML inference workloads on mobile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0699", "title": "The CoreML Multi-Array Pre-allocation", "topic": "real-time-deadlines", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you eliminate per-frame MLMultiArray allocation spikes in a 60 FPS Core ML video pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0700", "title": "The Quantization Quirk", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why would some operations run in FP32 on the CPU despite the model being quantized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0701", "title": "The Conversion Precision Loss", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which layer type is most likely the culprit, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0702", "title": "The Zero-Point Drift", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a TFLite INT8 activation decode incorrectly on-phone after a PyTorch model tested at 95% accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0703", "title": "The Cross-SoC Accuracy Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same INT8 face verification model have 0.1% FAR on Pixel 8 Pro but 1.2% on 
Galaxy S23?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 0}, "chain_tiers": {"mobile-chain-auto-017-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0704", "title": "The Cross-Platform Confidence Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where do CoreML and TFLite INT8 conversion differences make iOS show 87% confidence while Android shows 71%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0705", "title": "The Adaptive Precision Challenge", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you adapt your quantization strategy to maintain accuracy while still leveraging INT8 performance on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0706", "title": "The Quantization Fragmentation Trap", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can the same INT8 model produce different results on different SoCs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0707", "title": "The Neural Engine Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization choice should be used on Apple ANE versus Qualcomm Hexagon, and where is each platform's cliff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0708", "title": "The Quantization Conundrum", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What architectural reasons within the NPU or its surrounding SoC might explain this counter-intuitive result?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0709", "title": "The Int8 Quantization Activation Clipping", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What property of mobile activation functions causes Layer 5 activations to saturate at 127 after INT8 quantization on a mobile DSP?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 1}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0710", "title": "Core ML Mixed Precision for Neural Engine with FP32 Accuracy", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a mixed-precision strategy that gets Neural Engine speed with FP32 accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0711", "title": "W4A16 vs W4A4 Quantization for On-Device LLMs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is W4A16 a better quantization scheme than W4A4 despite keeping activations at 16-bit precision?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 2}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0712", "title": "The Quantization Divergence Across SoCs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can the same INT4 Gemma-2B weights produce different text on A18 Pro, Snapdragon 8 Gen 3, and Tensor G4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0713", "title": "The Cross-SoC Quantization Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does INT8 precision drop by 6% on the Exynos 2200, and how do you fix it without maintaining three separate models?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 1}, "chain_tiers": {"mobile-chain-auto-017-05": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0714", "title": "The INT4 Accuracy Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the significant technical challenges you anticipate with INT4 quantization compared to INT8, and how would you mitigate them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0715", "title": "The Granular Precision Architect", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a hardware-aware mixed-precision quantization strategy for a mobile generative model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0716", "title": "The Cross-Platform Confidence Score Divergence", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why can the confidence scores diverge by 0.45, and how do you reduce and verify cross-platform consistency for safety-critical applications?", "chain_ids": ["mobile-chain-auto-017-05"], "chain_positions": {"mobile-chain-auto-017-05": 2}, "chain_tiers": {"mobile-chain-auto-017-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0717", "title": "The Delegation Lottery", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does adding a single TFLite Flex post-processing op raise the inference latency from 4ms to 38ms?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0718", "title": "The Heterogeneous Scheduling Trap", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where does the pipeline spend 28ms if the NPU work accounts for under 0.5% of peak performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0719", "title": "The Mobile GPU Misconception", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the GPU 3× slower than the NPU for this workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0720", "title": "The SoC Interconnect Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is eating the other 9ms in the pipelined execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0721", "title": "The Qualcomm QNN SDK Delegation", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should QNN delegate each model operation across NPU, GPU, and CPU to minimize latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0722", "title": "The ANE Delegation Disaster", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the same Core ML segmentation model run in 6 ms on the A17 Pro but 69 ms on the older A15 Bionic?", "chain_ids": 
["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0723", "title": "The Dilated Convolution Penalty", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the dedicated AI hardware 4x slower than the general-purpose CPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0724", "title": "The UI Contention Crisis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an iPhone 15 Pro 7B local LLM drop from 25 to 5 tokens/sec when users scroll a 3D interface?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0725", "title": "The NPU Compiler Black Box", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do three compilers targeting three NPUs produce different partitioning decisions from the same ONNX graph, and how do you debug this?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0726", "title": "The Mobile AI Chip Roadmap Bet", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you build a model deployment pipeline that survives this hardware fragmentation without maintaining 5 separate codepaths?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0727", "title": "The Interconnect Choke Point", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What SoC-level component likely causes the discrepancy between sum of stage times and end-to-end latency, and how is its impact quantified?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0728", "title": "The Asymmetric Multiprocessing (Big.LITTLE) Stutter", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architectural feature of mobile CPUs causes this massive latency variance, and how can you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0729", "title": "The \"Unaccelerated Custom Op\" Dilemma", "topic": "accelerator-comparison", 
"competency_area": "compute", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you accelerate a GNN custom aggregation op that causes a 20 ms CPU spike on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0730", "title": "The ISP/NPU Hardware Synchronization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "What hardware synchronization mechanism prevents an NPU from reading an ISP hardware buffer before DMA writes are visible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0736", "title": "Zero-Shot UI Action Grounding", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the on-device grounding architecture and the handoff protocol under these strict memory and latency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0737", "title": "On-Device Vector Store with CoreML Embeddings and LDP", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you architect the local vector store, the CoreML embedding pipeline, and the LDP mechanism?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0738", "title": "The Semantic Router", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What router architecture meets <10 ms latency, 5-turn context, <50 MB RAM, and Neural Engine execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0739", "title": "CoreML vs PyTorch", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Can you deploy a raw .pt or .pth PyTorch model directly into a native iOS Swift application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, iOS natively executes raw PyTorch files (.pt) using the Swift Torch API.", "Yes, but it requires FP16 conversion via `torch.half()` before loading.", "No, the model must be converted to the CoreML format for hardware acceleration on iOS.", "No, iOS only natively supports TensorFlow Lite models (.tflite)."], "correct_index": 2}}, {"id": "mobile-0740", "title": "Background Inference Limits", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": 
"understand", "status": "published", "phase": "inference", "question": "What is the primary risk of running heavy ML inference in the background on iOS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model will automatically downgrade to 1-bit quantization.", "iOS will suspend or terminate unsupported long-running background inference.", "The user's screen will freeze until the background task completes.", "The App Store will reject the app during the review process."], "correct_index": 1}}, {"id": "mobile-0741", "title": "The Launch Blocker", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on the launch timeline, why are you seeing a 95% abandonment rate?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0742", "title": "The Operator Gap", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the DMA flow, why do two unsupported ops cause a 50% latency penalty in a mobile NPU deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0743", "title": "The Jetsam Guillotine", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the RAM diagram, what is the 'invisible' resource consumer causing the OS to kill your process?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 3}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0744", "title": "The Burst Benchmarking Illusion", "topic": "real-time-deadlines", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the performance timeline, what physical protection mechanism is engaging inside the smartphone?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 3}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0745", "title": "The Backbone Bloat", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the VRAM diagram, what is the 'efficiency gap' in your model loading strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-0746", "title": "The Frankenstein Model", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "After a reboot interrupts a background model update, why can the app launch but produce garbage outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0747", "title": "The CPU Wake-Lock Tax", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What battery drain mechanism explains the wake-word app's high energy use despite the model's low CPU cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0748", "title": "The Silicon Shared Oven", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the SoC thermal envelope, why is the 3D game affecting the NPU speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0749", "title": "The Radio Energy Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the power breakdown, where is your optimization effort being wasted?", "chain_ids": ["mobile-chain-bucket-powerbud-06"], "chain_positions": {"mobile-chain-bucket-powerbud-06": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0750", "title": "The UMA Bandwidth Wall", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Based on the Unified Memory Architecture, why does a faster 120Hz screen slow down your AI models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0751", "title": "The ANE Efficiency Advantage", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the compute efficiency in TOPS per watt, and why does this metric matter more than raw TOPS for a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.5 TOPS/W. Mobile chips sacrifice efficiency for portability.", "7.0 TOPS/W. The battery is the binding constraint, so efficiency determines the thermal limit.", "35.0 TOPS/W. The 5W figure represents idle power.", "175.0 TOPS/W. 
NPUs achieve superlinear scaling at INT8."], "correct_index": 1}}, {"id": "mobile-0752", "title": "The 48MP Camera Firehose", "topic": "real-time-deadlines", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Before any ISP processing or compression, what is the raw data bandwidth flowing from the sensor to the SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~30 MB/s. Each 48MP frame is about 1MB compressed.", "B) ~288 MB/s. 48MP at 1 byte per pixel at 6 fps.", "C) ~2.9 GB/s. 48MP at 2 bytes per pixel at 30 fps, uncompressed.", "D) ~29 GB/s. 48MP at 20 bytes per pixel with full color depth."], "correct_index": 2}}, {"id": "mobile-0753", "title": "The On-Device Fine-Tuning Data Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much memory does this training dataset occupy, and does it fit comfortably in the unified memory of a mobile device alongside the model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~6 MB. JPEG images are small and efficient.", "B) ~60 MB. Each tensor is about 60 KB.", "C) ~600 MB. Each 224x224x3 FP32 tensor is ~600 KB, times 1,000 images.", "D) ~6 GB. Deep learning datasets always require gigabytes."], "correct_index": 2}}, {"id": "mobile-0754", "title": "The INT4 Quantization Payoff", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much memory does the INT4 model save, and what is the primary risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 0.75 GB savings. INT4 is only 25% smaller than INT8.", "B) 1.5 GB savings. INT4 uses half the bytes of INT8, but risks accuracy loss in sensitive layers.", "C) 3.0 GB savings. INT4 eliminates half the parameters entirely.", "D) No savings. INT4 still requires INT8 storage with a lookup table."], "correct_index": 1}}, {"id": "mobile-0755", "title": "The Battery Inference Budget", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this feasible with inference alone, ignoring all other phone functions?", "chain_ids": ["mobile-chain-auto-012-02"], "chain_positions": {"mobile-chain-auto-012-02": 0}, "chain_tiers": {"mobile-chain-auto-012-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes. 18.5 Wh / 5W = 3.7h of inference, plus the OS is free.", "No. Inference alone drains the battery in 3.7h; with system overhead it is ~2.3h. Must reduce frame rate.", "Yes. 
The ANE uses only milliwatts for neural inference.", "No, but only because the phone will thermal-throttle first."], "correct_index": 1}}, {"id": "mobile-0756", "title": "The Depthwise Separable Convolution Dividend", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "By what factor does this reduce the multiply-accumulate operations per spatial position?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) ~2.0x (Trap: assuming it just splits one convolution into two equal halves).", "B) ~4.5x (Trap: omitting the pointwise projection cost).", "C) ~8.7x (Correct).", "D) ~256x (Trap: assuming it scales perfectly with channel depth alone)."], "correct_index": 2}}, {"id": "mobile-0759", "title": "The Vocabulary Embedding Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What fraction of a 3.5 GB INT4 model does the embedding table represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.75%", "~7.5%", "~15%", "~75%"], "correct_index": 1}}, {"id": "mobile-0760", "title": "The Speculative Decoding Gambit", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the effective token throughput with speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 25 tok/s.", "B) 10 tok/s.", "C) 5 tok/s.", "D) 20 tok/s."], "correct_index": 1}}, {"id": "mobile-0761", "title": "The Cellular Model Delivery Problem", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the download time and propose a deployment strategy that respects mobile network constraints?", "chain_ids": ["mobile-chain-auto-secondary-005-14"], "chain_positions": {"mobile-chain-auto-secondary-005-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-005-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0762", "title": "The Federated Learning Upload Bill", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the total daily upload bandwidth and explain why naive federated learning is infeasible at scale without compression?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0764", "title": "The Privacy Budget Drain", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How does advanced composition 
improve this, and what is the practical implication for query budgeting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0765", "title": "The On-Device Training Memory Crisis", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How much memory does gradient checkpointing save for 32-layer FP16 activations, and what compute cost does it add?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0767", "title": "The Mobile MoE Memory Illusion", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why does top-2 routing not reduce the resident expert footprint to 0.5 GB on mobile, and what design alternatives would you consider if the 2 GB resident set is too large?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0768", "title": "The On-Device RLHF Memory Wall", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the specific memory stacks involved, and is on-device RLHF feasible within 8 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0769", "title": "The Replay Buffer Memory Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you calculate the buffer size and design a memory-aware eviction policy that prevents unbounded growth on a memory-constrained device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0770", "title": "The Million-Device Adapter Sync Storm", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the server-side egress bandwidth required and cost for syncing 10MB adapters every 5 minutes to 1 million users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0771", "title": "The Federated Gradient Compression Trade-Off", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the per-round upload volume and analyze the convergence trade-off of aggressive compression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0772", "title": "The Model Distillation Sync Budget", 
"topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a sync protocol that keeps the student model fresh without exceeding the 100 MB daily cellular data budget?", "chain_ids": ["mobile-chain-auto-secondary-005-14"], "chain_positions": {"mobile-chain-auto-secondary-005-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-005-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0773", "title": "The Silent Model Corruption Problem", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a verification system that detects such corruption before inference begins?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0774", "title": "The Thermal Throttling Adaptation Loop", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an adaptive inference strategy that maintains acceptable user experience under thermal constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0775", "title": "Diagnosing Core ML ANE Fallback", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the architectural root cause of this execution pattern, and how can it be analyzed and resolved to achieve target performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0776", "title": "CoreML Execution Fallback Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total CoreML frame latency and sustainable FPS after adding ANE compute, CPU fallback, and two copy penalties?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0777", "title": "CoreML ViT ANE Fallback Evaluation", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimization path best fixes ViT CoreML ANE-GPU fallback and meets the 60 FPS frame budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0778", "title": "TFLite Delegate Graph Partitioning Trade-offs", "topic": "real-time-deadlines", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs and justify a design decision to meet both the 33ms latency and 1.5W power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0779", "title": "Diagnosing TFLite NNAPI Delegate Subgraph Fallback", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can enabling the TFLite NNAPI delegate make a MobileNetV3 model slower than CPU inference?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 1}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0780", "title": "TFLite Delegate Subgraph Partitioning Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If each CPU-GPU context switch (tensor copy and synchronization) takes 2ms, what is the expected new end-to-end latency?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 0}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0781", "title": "Edge Vision Model: CNN vs ViT Deployment", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architecture is better suited for this mobile INT8 NPU and small SRAM budget: MobileNetV2 or a Mobile ViT of equivalent accuracy, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0782", "title": "The NAS Latency Predictor Blind Spot", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did an NPU latency LUT predict 8ms but produce 22ms inside a full camera pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0783", "title": "Hedged Edge-Cloud Model Routing", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "When should the voice translation app launch a local hedged fallback to guarantee a 400ms P99 latency bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0784", "title": "Thermal-Aware On-Device Pipelining", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", 
"phase": "inference", "question": "How can asynchronous pipelining with bounded stage queues preserve steady-state 30 FPS throughput after GPU thermal throttling increases one stage to 25ms, even though end-to-end frame latency rises?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0785", "title": "Battery-Aware Sensor Batching", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What sensor batching window minimizes smartwatch CPU duty cycle while preserving fall detection within a 1000ms SLA?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0787", "title": "Speculative Decoding Performance Regression", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding reduce mobile throughput despite a 65% draft acceptance rate?", "chain_ids": ["mobile-chain-auto-secondary-017-33"], "chain_positions": {"mobile-chain-auto-secondary-017-33": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0788", "title": "The Context Length Latency Spike", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does mobile chat decode latency jump exactly at a 1024-token context length?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0789", "title": "Multi-Turn Chat Degradation", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does multi-turn mobile chat TTFT degrade as history grows while decode speed stays constant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0790", "title": "The Code Generation Stall", "topic": "speculative-decoding", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does speculative decoding become slower than target-only decoding for Python code generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0791", "title": "The Background Batching Bottleneck", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does adding a background summarizer reduce total LLM throughput instead of improving batching efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-0792", "title": "The INT8 KV Cache Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does quantizing the KV cache to INT8 save memory but increase decode latency and lower hardware utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0793", "title": "The PagedAttention System Call Trap", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a PagedAttention port with 16-token blocks cause CPU overhead and decode jitter on Android?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0794", "title": "AR Glasses Thermal Budgeting", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does the AR glasses ML workload fit the 1.5W thermal envelope, and how should average power be calculated?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0795", "title": "Daily Carbon Footprint of Mobile Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the daily carbon footprint of the mobile federated learning approach including compute, 5G transfer, and cloud aggregation?", "chain_ids": ["mobile-chain-auto-013-05"], "chain_positions": {"mobile-chain-auto-013-05": 3}, "chain_tiers": {"mobile-chain-auto-013-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0796", "title": "Always-On Wake Word Battery Drain", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the percentage of battery drained over 24 hours for the DSP approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0797", "title": "LLM DVFS and Static Power Wall", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the energy per token for both the max frequency and 50% frequency states to determine if downclocking actually saves energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0798", "title": "Super-Res NPU vs 5G Streaming Energy", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the total energy saved (in Joules) by using the ML super-resolution approach over the 2-hour movie?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0799", "title": "Thermal Throttling in Background Indexing", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of images the device can index per minute without violating the thermal limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0800", "title": "On-Device vs Cloud LLM Energy Cost", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much more total system energy (in Joules) does the on-device approach use compared to the cloud approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0802", "title": "Memory-Bound NPU Execution Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the layer take 0.5ms instead of the compute-bound 0.036ms estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0803", "title": "Unaligned Channel Compiler Fallback", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 13-channel Cortex-M4 convolution be 3x slower than a 16-channel convolution with more MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0804", "title": "Activation Function CPU Fallback", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did replacing ReLU with SiLU make the mobile NPU segmentation model jump from 20ms to 850ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0805", "title": "Dispatch Overhead in Tiny Models", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 50KB dense model run faster on a Cortex-A CPU than on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0806", "title": "Data Reuse in Depthwise Convolutions", "topic": "pruning-sparsity", "competency_area": "optimization", 
"track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a depthwise convolution with 10x fewer MACs be slower than a pointwise convolution on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0807", "title": "Activation Spilling and SRAM Overflow", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a high-resolution input make the Cortex-M4 model spike from 50ms to 500ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0808", "title": "SRAM Weight Pinning and Power Drain", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 10MB mobile NPU model draw unexpected power by repeatedly reading weights from DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0809", "title": "DMA Double-Buffering Overhead", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does DMA fail to hide memory latency when the Cortex-M4 waits for each tile before computing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0810", "title": "KV Cache Tiling and SRAM Thrashing", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does KV cache tiling latency grow quadratically as sequence length increases from 100 to 1000?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0811", "title": "Fake Quantization CPU Overhead", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why did INT8 quantization reduce MobileNet size but not CPU inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0812", "title": "INT8 Accumulator Overflow", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do all INT8 DSP intermediate tensors saturate to 127 and destroy accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0813", "title": "Asymmetric Quantization Runtime Overhead", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does asymmetric INT8 quantization increase mobile CPU latency compared with symmetric quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0814", "title": "Mobile New 0014", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a mixed INT8/INT16 mobile network run slower than an all-INT16 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0815", "title": "Per-Channel Quantization Memory Stalls on DSP", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does per-channel INT8 quantization stall DSP vector ALUs compared to per-tensor quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0816", "title": "APK Compression Breaking Zero-Copy Mmap", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a compressed 20MB TFLite model freeze the Android UI for 4 seconds during first initialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0817", "title": "Selective Building for Edge ML Binaries", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 2MB keyword spotting model produce a 115MB IoT binary, and how should the runtime be built instead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0818", "title": "Memory Fragmentation Causing NPU OOM Errors", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does loading a 50MB model fail with an OOM error when 120MB of free RAM is available?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0819", "title": "ZIP Compression Breaks OTA Binary Deltas", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can changing one bias value make an OTA delta for a 100MB model nearly as large as the full model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0820", "title": "DRAM Power Bottleneck in High-FPS Edge Inference", "topic": "compute-cost-estimation", 
"competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the AR headset drain battery despite only 20% NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0821", "title": "Polling vs Event-Driven Execution in Always-On MCU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the always-on Cortex-M4 wake-word detector die in days instead of lasting 30 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0822", "title": "Thermal Leakage Power Runaway in Mobile SoCs", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does SoC power rise from 3W to 4.5W over time despite constant MACs per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0823", "title": "CPU Preprocessing Overhead in Edge Vision Pipelines", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone vision pipeline consume high baseline power before NPU inference starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0824", "title": "Semantic Gating for Always-On Object Detection", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the empty-hallway security camera drain battery as fast as a busy scene?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0825", "title": "DVFS Thermal Throttling in Continuous Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Orin tracking latency abruptly double after four minutes despite normal ambient temperature?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0826", "title": "Ergonomic Skin Temperature Limits on Mobile NPUs", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the smartphone OS throttle the NPU even though die temperature is only 50C?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0827", "title": "Thermal Saturation in Passively Cooled AR 
Headsets", "topic": "real-time-deadlines", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a passively cooled AR headset gradually fall from 60 FPS to 15 FPS in a one-hour test?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0828", "title": "Ambient Temperature Impact on Passively Cooled Edge Servers", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an edge server in a closed metal box reboot during the day but work at night?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0829", "title": "Memory Bus Contention Between Concurrent Edge Models", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the audio wake-word model miss deadlines only when video upscaling runs concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0830", "title": "UI Thread Preemption of Mobile ML Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does on-device translation token latency spike when the user scrolls quickly through the UI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0831", "title": "Kernel Fusion Misses and Memory I/O Latency", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an unfused Conv-BatchNorm-ReLU block take 1.8ms instead of one 0.5ms GPU kernel?", "chain_ids": ["mobile-chain-auto-secondary-007-13"], "chain_positions": {"mobile-chain-auto-secondary-007-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0832", "title": "Layout Mismatch Transpose Overheads in TFLite", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does TFLite insert expensive Transpose operations after converting a PyTorch model for a mobile DSP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0833", "title": "NPU Memory Tiering Specification", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a layer execution and weight-tiering specification to maximize the 
auto-regressive token generation rate, detailing how to utilize the fast SRAM.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0836", "title": "Mixed-Precision Mobile Super-Resolution", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the mechanism of the accuracy drop, and what mixed INT8/FP16 execution graph specification resolves it while preserving throughput?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 3}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0840", "title": "Mitigating Mobile DRAM Thermal Throttling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate an architectural change to the model's execution strategy to eliminate off-chip intermediate activation memory traffic.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0850", "title": "Analyze MobileNetV3 latency on A17 Pro ANE vs GPU execution path", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the ANE 2.3x faster than the GPU for MobileNetV3, and which specific operations benefit most?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0851", "title": "Compare EfficientNet-Lite0 and MobileNetV3 per-inference energy on Snapdragon 8 Gen 3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does EfficientNet-Lite0 compare to MobileNetV3-Large in energy consumption per inference on the Snapdragon 8 Gen 3 Hexagon DSP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0852", "title": "Diagnose accuracy regression after Core ML conversion of EfficientNet-B0 for A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the sources of the 2.3pp accuracy regression?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0853", "title": "Evaluate depthwise separable vs standard conv for on-device training on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate whether depthwise separable convolutions or standard
convolutions in these final blocks better balance training speed and accuracy for on-device learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0854", "title": "Evaluate EfficientNet-Lite vs MobileNetV3 for CoreML ANE deployment accuracy-latency", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which CNN backbone should be selected for an accuracy-critical iOS app running on the ANE, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0855", "title": "Fluency: explain MobileNet-family tradeoffs to an iOS product manager", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the MobileNet vs ResNet-50 accuracy, speed, and battery tradeoffs to a product manager?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0856", "title": "Fluency: describe inverted residual block execution on Qualcomm Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV2 inverted residual block map to Hexagon HTP, and when is expansion ratio 6 too large?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0857", "title": "Implement MobileNetV3 fine-tuning pipeline for Core ML deployment on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should you fine-tune and convert MobileNetV3-Large so it runs on the Neural Engine under 3ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0858", "title": "Implement EfficientNet-B0 for Snapdragon 8 Gen 3 Hexagon NPU via QNN SDK", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should EfficientNet-B0 be converted and optimized for Snapdragon 8 Gen 3 Hexagon deployment with the QNN SDK?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0859", "title": "Mastery: explain compound scaling failure modes for mobile deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Does EfficientNet-B3 fit the unchanged 5ms mobile NPU 
SLA, and where does compound scaling break down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0860", "title": "Mastery: MobileNetV3 SE block impact on A17 Pro ANE vs Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should MobileNetV3 SE blocks be handled on A17 Pro ANE versus Snapdragon Hexagon, and what latency overheads matter?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0861", "title": "Optimize MobileNetV2 expansion ratio for Snapdragon 8 Gen 3 Hexagon HTP memory", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 expansion ratios be adjusted to reduce early activation memory pressure on Hexagon HTP?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0862", "title": "Optimize EfficientNet-Lite0 for continuous inference battery life on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can EfficientNet-Lite0 continuous AR inference cut battery drain below 4% per hour without changing architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0863", "title": "Realization: ship MobileNetV3 model update OTA with Core ML compilation on device", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV3 Core ML OTA update be delivered, compiled, validated, and rolled back within the setup target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0864", "title": "Snapdragon 8 Gen 3 CNN Ensemble Scheduling", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a MobileNetV3-Small and EfficientNet-Lite0 ensemble be split across Snapdragon compute units under 8ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0865", "title": "Recall depthwise separable convolution parameter count for MobileNetV2 first block", "topic": "cnn-efficient-design", "competency_area": 
"architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How would you calculate the exact parameter count and FLOPs for this block and explain why t=1 is used only in the first block?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0866", "title": "Specification: define mobile CNN requirements for real-time AR on A17 Pro", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What technical specification should define this real-time AR mobile CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0867", "title": "Specification: model format and versioning requirements for cross-platform mobile CNN deployment", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What packaging, versioning, and validation spec should govern cross-platform MobileNetV3 deployment to Core ML and QNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0868", "title": "Analyzing Multi-Model Pipeline Memory Pressure on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory pressure and identify which models can be hot-swapped vs kept resident to stay within the 4GB app memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0869", "title": "Analyzing Compound AI Pipeline Thermal Throttling on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the thermal contribution of each pipeline stage and determine which stage to optimize first to recover performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0870", "title": "Analyzing On-Device vs Cloud Offload Decision for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze under which conditions on-device vs cloud is preferred, considering battery, latency, and privacy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0871", "title": "Designing a 3.5GB On-Device RAG System for Personal Documents", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": 
"inference", "question": "How should an on-device RAG system fit document search and LLM generation within memory and response-time constraints?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0872", "title": "Designing a Resilient Agent Fallback Architecture for Intermittent Connectivity on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a mobile compound AI agent switch gracefully between cloud and on-device modes during intermittent connectivity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0873", "title": "Diagnosing Slow First-Token Latency in On-Device RAG on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Tensor G3 on-device RAG have 4.2 s TTFT with a loaded 1B INT8 LLM and three 400-token documents?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0874", "title": "Diagnosing Context Overflow Failures in Multi-Turn Mobile Agents on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose when context overflow occurs and design a prevention strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0875", "title": "Diagnosing Retrieval Quality Degradation After App Update on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why retrieval quality degraded and quantify the impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0876", "title": "Evaluating Embedding Model Quality vs Size Tradeoff on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which model maximizes quality within a 200ms retrieval budget per session and a 100MB memory limit for the embedding model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0877", "title": "Evaluating Speculative Decoding Viability for On-Device LLM on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", 
"status": "published", "phase": "inference", "question": "How would you evaluate whether speculative decoding provides a net speedup given the verification overhead on mobile hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0878", "title": "Evaluating End-to-End Compound Pipeline Quality on Tensor G3 with a RAG Evals Suite", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the gap between individual stage metrics and the 0.61 end-to-end satisfaction score?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0879", "title": "Explaining Compound AI Pipeline Stages to a Mobile App Developer on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why does a compound AI pipeline need retrieval instead of feeding Core ML image features directly to a 1B LLM?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0880", "title": "Explaining Model Routing Tradeoffs in Compound AI to a Product Manager", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can't we just always use the best, most powerful model for every request on a mobile device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0881", "title": "Explaining Token Budget Constraints for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why is the context window so much smaller on mobile, and why does it matter for compound AI?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0882", "title": "Implementing a Streaming Response Pipeline for Compound AI on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the compound AI pipeline stream tokens so the first response appears within 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0883", "title": "Implementing Background Index Sync for On-Device RAG on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would 
you design the background sync architecture with power and latency constraints to prevent battery drain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0884", "title": "Mastering Compound AI System Reliability Engineering on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a reliability architecture that achieves a 99.5% successful response rate for this compound pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0885", "title": "Mastering End-to-End Optimization of a 4-Stage Compound Pipeline on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a comprehensive optimization strategy that achieves this target without changing models or hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0886", "title": "Mastering On-Device Knowledge Base Update Strategy for Compound AI on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a knowledge base update strategy that maintains retrieval quality, minimizes battery impact, and handles the case where updates arrive while the user is actively using the app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0887", "title": "Optimizing Retrieval Chunk Size for On-Device RAG on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the memory and latency impact and determine whether the quality improvement justifies the cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0888", "title": "Optimizing Prefill Latency via Prompt Compression on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate the net latency improvement and decide whether prompt compression is worthwhile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0889", "title": "Optimizing Agent Tool-Call Frequency to Reduce Battery Drain on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can the Tensor G3 agent reduce tool-call energy by at least 
40% without reducing task quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0890", "title": "Realizing a Compound AI App with CoreML Multi-Model Orchestration on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should CoreML load and schedule a three-model compound AI pipeline within a 600MB A17 Pro process budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0891", "title": "Realizing On-Device Agent State Persistence Across App Backgrounding on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an Android agent persist lightweight state so it can resume within 500ms after backgrounding without saving the entire KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0892", "title": "Realizing a Hybrid On-Device and Cloud Compound AI Pipeline on Tensor G3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a hybrid compound AI pipeline decide between cloud and on-device generation while preserving context privacy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0893", "title": "Recalling Compound AI Pipeline Components and Their Roles on Mobile", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the standard components in an on-device RAG pipeline, and what does each one do?", "chain_ids": ["mobile-chain-auto-secondary-003-01"], "chain_positions": {"mobile-chain-auto-secondary-003-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0894", "title": "Specifying Latency SLO Contracts for Each Stage of a Mobile Compound Pipeline on A17 Pro", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What per-stage latency SLOs and error budgets should a five-stage pipeline use for a 1.5s TTFT SLA?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0895", "title": "Specifying Model Update and Rollback Contract for Compound AI on Snapdragon 8 Gen 3", "topic": "compound-ai-systems", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What OTA model update and rollback protocol can 
detect a bad compound AI model and revert within one hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0896", "title": "Dataset Curation: Design On-Device Data Collection for Mobile Model Training", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the data collection, quality filtering, and privacy-preserving local storage pipeline?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 1}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0897", "title": "Dataset Curation: Evaluate On-Device vs Cloud Data Curation for Mobile ML", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should photo-enhancement data curation run on-device or in the cloud for privacy, latency, label quality, and cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0898", "title": "Dataset Curation: Evaluate Federated Data Heterogeneity Impact on Mobile Model Quality", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the impact of data heterogeneity on global model quality and propose a mitigation strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0899", "title": "Dataset Curation: Fluency — Token Budget for On-Device Fine-Tuning on A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you estimate the total fine-tuning compute, memory required, and time to complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0900", "title": "Federated DP-SGD Parameters for Device-Level Mobile Privacy", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should DP-SGD be implemented for federated gradients on mobile devices to achieve epsilon 1.0, and what are the required parameters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0901", "title": "Dataset Curation: Implement Data Versioning for Mobile Federated Learning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should a federated learning system version-gate devices and data when model versions differ across the fleet?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0902", "title": "Dataset Curation: Mastery — On-Device Personalization Data Strategy for 1B Mobile Users", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete on-device personalization data strategy (collection, privacy, local curation, federated aggregation, and global model updates)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0903", "title": "Dataset Curation: Mastery — Sensor Fusion Dataset Design for On-Device Health Model", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design data curation, synchronization, and privacy-preserving collection for the target device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0904", "title": "Dataset Curation: Optimize Data Pipeline for Federated Learning on Mobile", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How can the Tensor G3 federated learning preprocessing pipeline be optimized when preprocessing dominates local training time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0905", "title": "Dataset Curation: Optimize Label Quality vs Cost Tradeoff for Mobile App Training", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What tiered labeling strategy can meet a $10,000 budget and near-90% label accuracy for 500K mobile photos?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0906", "title": "Dataset Curation: Realize Training Data Requirements for On-Device LLM Fine-Tuning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What data and LoRA fine-tuning plan lets a 1B email assistant fit and train on mobile constraints within five minutes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0907", "title": "Federated Learning Tokenizer and Schema Consistency Across Mobile Clients", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should cross-device federated learning enforce consistent tokenizers, data formats, and label schemas?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0908", 
"title": "Dataset Curation: Recall — What is Federated Learning Data Privacy?", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What does federated learning mean for mobile data privacy, and how does it differ from centralized data collection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0909", "title": "Dataset Curation: Specification — Mobile Data Quality SLA for RLHF Personalization", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What data quality SLAs should gate on-device RLHF personalization updates on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0910", "title": "Fault Tolerance: Analyze Checkpoint Overhead for On-Device Fine-Tuning on A17 Pro", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the expected number of fleet-wide failures during one 10-minute daily training window, and how does it impact checkpoint frequency?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 1}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0911", "title": "Fault Tolerance: Design Model Update Safety Protocol for Mobile LLM", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a fault-tolerant update protocol that limits the blast radius of a bad update to < 0.01% of total users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0912", "title": "Fault Tolerance: Evaluate On-Device vs Cloud Recovery for Mobile ML State", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the recovery time, user experience, and privacy implications of cloud recovery vs local rebuilding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0913", "title": "Fault Tolerance: Evaluate App Crash vs Model Corruption Recovery Paths", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you compare the recovery protocol, recovery time, and user impact for each failure mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-0914", "title": "Fault Tolerance: Fluency — Mobile Checkpoint Write Speed and Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the checkpoint write time and energy overhead if saving every 60 seconds during a 10-minute background training session?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 0}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0915", "title": "Fault Tolerance: Implement Atomic Model Swap for Zero-Downtime Mobile Update", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should an atomic mobile model swap update a 500MB LLM without interrupting in-flight inference and still support rollback?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0916", "title": "Fault Tolerance: Implement Differential Checkpoint for Mobile LoRA Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should differential checkpointing reduce mobile LoRA checkpoint size while preserving recovery correctness?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0917", "title": "Fault Tolerance: Mastery — Mobile Personalization Reliability Architecture", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "Design a complete fault tolerance architecture that preserves personalization state across all these events?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0918", "title": "Fault Tolerance: Mastery — Systematic Failure Mode Analysis for Mobile ML", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the top 5 failure modes, their probability, impact, and mitigations with quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0919", "title": "Fault Tolerance: Optimize Recovery Speed for Frequently Crashing Mobile Training", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should the job recover from thermal crashes in under 30 seconds and reduce future crashes?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0920", "title": "Fault Tolerance: Optimize Checkpoint Storage Budget for Low-Storage Mobile Devices", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you reduce checkpoint storage below 50 MB while preserving fault tolerance on low-storage A17 Pro devices?", "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 2}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0921", "title": "On-Device Training Checkpointing for Mobile Personalization", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify storage layout, checkpoint schedule, retention policy, and recovery paths for an on-device training system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0922", "title": "Fault Tolerance: Realize Mobile Fleet Update Rollout Timeline", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify staging percentages, validation gates, CDN requirements, and rollback procedures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0923", "title": "Fault Tolerance: Recall — What is an A/B Model Partition?", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is an A/B model partition strategy, and why does it improve fault tolerance for on-device model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0924", "title": "Fault Tolerance: Specification — Reliability SLA for Mobile Payment ML Model", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What reliability SLA should govern an on-device payment fraud model, including RTO, RPO, fail-safe behavior, and checkpoints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0925", "title": "Operator Fusion on Mobile Neural Engines", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is operator fusion on the A17 Pro Neural Engine, and which transformer operations are commonly fused?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": 
{"mobile-chain-auto-secondary-007-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0926", "title": "Kernel Fusion: Design Fusion Strategy for On-Device LLM on Snapdragon 8 Gen 3", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What operator fusion and execution-placement graph would you use for a 1B 4-bit GQA LLM on Snapdragon 8 Gen 3 to target 20 tokens/sec?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0927", "title": "Kernel Fusion: Evaluate Core ML vs ONNX Runtime Fusion for iPhone LLM", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate Core ML vs ONNX Runtime on kernel fusion depth, tokenizer throughput, first-token latency, and subsequent-token latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0928", "title": "Kernel Fusion: Evaluate Quantized Attention Fusion on A17 Pro", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16 unfused attention and INT8 fused attention compare for memory, latency, and accuracy?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0929", "title": "Kernel Fusion: Fluency — Estimate Neural Engine Throughput for Fused MLP", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the estimated Neural Engine throughput for the fused MLP block, and which phase is the bottleneck?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0930", "title": "Kernel Fusion: Implement Fused Conv+BN+ReLU for Mobile CNN on Snapdragon", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify the fusion implementation, verify correctness, and quantify memory bandwidth savings?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0931", "title": "Flash Attention SRAM Tiling for Mobile NPU LLM Inference", "topic": "kernel-fusion", 
"competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify tile sizes for flash attention given available SRAM, and compute memory savings vs standard attention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0932", "title": "Kernel Fusion: Mastery — Optimize 3B LLM Decode Throughput on A17 Pro", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which optimizations move a 3B INT4 LLM decode on A17 Pro from 18 to 30 tokens/sec, and by how much?", "chain_ids": ["mobile-chain-auto-secondary-007-14"], "chain_positions": {"mobile-chain-auto-secondary-007-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0933", "title": "Kernel Fusion: Mastery — Fusion Strategy Across Heterogeneous Mobile Hardware", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a single unified model with hardware-adaptive fusion strategy that achieves 20 tokens/sec on all three platforms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0934", "title": "Kernel Fusion: Optimize SwiGLU Fusion for Mobile LLM on Tensor G3", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is an 8ms SwiGLU activation unrealistic, and what fusion fix should remove the overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0935", "title": "Kernel Fusion: Optimize Attention Fusion for Long-Context Mobile LLM", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you analyze feasibility and design an optimized attention fusion strategy for a 4096-token context on an 8GB device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0936", "title": "Kernel Fusion: Realize Fusion Plan for Production Mobile LLM Deployment", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify all fused op groups, their implementations, testing requirements, and expected latency for each platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0937", "title": "Kernel Fusion: Realize Bandwidth Savings from Fusing Transformer Block on 
Snapdragon", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory bandwidth does fusing post-attention operations save for a 1B decode block on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0938", "title": "Kernel Fusion: Recall — Arithmetic Intensity and Mobile Fusion Impact", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of a ReLU on a 4096-element FP16 tensor, and how much latency does kernel fusion save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0939", "title": "Kernel Fusion: Define Fusion Requirements for Cross-Platform SDK", "topic": "kernel-fusion", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What fusion requirements and platform adaptation rules should a cross-platform mobile LLM SDK specify to reach 20 tokens per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0940", "title": "KV-Cache: Analyze KV Cache Memory Pressure on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why KV cache growth causes this degradation and compute the expected latency scaling?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0941", "title": "KV-Cache: Design KV Cache Management for On-Device LLM Chat App", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the KV cache management system for this application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0942", "title": "KV-Cache: Design Multi-Turn KV Cache Persistence for Mobile App Switching", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a KV cache persistence strategy that allows resuming conversation within 3 seconds of app foreground?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0943", "title": "KV-Cache: Diagnose KV Cache Overflow Causing Mobile App Crashes", "topic": "kv-cache-management", "competency_area": "memory", "track": 
"mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 3B LLM app crash near 3000 tokens, and how should the KV cache be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0944", "title": "KV-Cache: Diagnose KV Cache Thrashing in Concurrent Mobile LLM Requests", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does A17 Pro foreground chat slow 3x when a 1B background summarizer holds a 2048-token KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0945", "title": "KV-Cache: Diagnose Attention Head Selection Causing Memory Inefficiency", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the current memory inefficiency and quantify the GQA improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0946", "title": "KV-Cache: Evaluate KV Cache Quantization Strategies on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do FP16, INT8, and INT4 KV caches trade memory, bandwidth, quality, and complexity for a 1B LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0947", "title": "KV-Cache: Evaluate Sliding Window vs Full KV Cache for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate memory, quality, and latency over a 50-turn, 3000-token conversation for these two KV cache strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0948", "title": "Compare MHA vs GQA KV-Cache Memory on Mobile LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare MHA with 16 KV heads versus GQA with 4 KV heads for memory, bandwidth, quality, and architecture impact?", "chain_ids": ["mobile-chain-bucket-kvcachem-06"], "chain_positions": {"mobile-chain-bucket-kvcachem-06": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0949", "title": "KV-Cache: Fluency — KV Cache Size Estimation for 1B Mobile 
LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a 4096-token conversation feasible in the available memory on this device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0950", "title": "KV-Cache: Fluency — Decode Throughput vs Context Length Trade-off on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does KV cache bandwidth degradation affect decode throughput across 256, 1024, and 4096 context tokens?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0951", "title": "KV-Cache: Fluency — GQA Memory Savings Calculation for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do MHA, GQA, and MQA KV-cache sizes affect maximum context for a 3B INT4 LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-06"], "chain_positions": {"mobile-chain-bucket-kvcachem-06": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0952", "title": "KV-Cache: Implement KV Cache Size Limiter for Mobile LLM Memory Safety", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a 1B LLM cap KV cache at an 80% memory threshold while preserving key tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0953", "title": "KV-Cache: Mastery — KV Cache Co-Design for Mobile LLM Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you co-design the model architecture and KV cache management policy to meet the target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0954", "title": "Efficient KV Cache Strategy for a 32K-Token Mobile Code Assistant", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a memory-efficient KV cache, eviction policy, and quality preservation strategy for a 32K context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0955", "title": "KV-Cache: Optimize KV Cache for Aggressive Memory 
Compression on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should KV cache compression and offloading free memory for longer contexts on a 3B A17 Pro LLM?", "chain_ids": ["mobile-chain-bucket-kvcachem-04"], "chain_positions": {"mobile-chain-bucket-kvcachem-04": 3}, "chain_tiers": {"mobile-chain-bucket-kvcachem-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0956", "title": "KV-Cache: Optimize KV Pruning for Token Budget on Snapdragon 8 Gen 3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For an 8192-token legal document on a 2048-token KV budget, should Snapdragon 8 Gen 3 use stride pruning or H2O?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0957", "title": "KV-Cache Memory Layout for a 3B LLM on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you specify tensor dimensions, memory addresses, data types, and total allocation for the KV cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0958", "title": "KV-Cache: Realize KV Cache Reuse for System Prompt on Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you cache the system prompt KV, and what is the quantified TTFT improvement for a 512-token user message?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0959", "title": "KV-Cache: Recall — What is a KV Cache and Why Does It Matter for Mobile LLM?", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a KV cache, why does mobile LLM decoding need it, and how does generation compute scale without it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0960", "title": "Production Mobile LLM KV Cache Requirements on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What KV cache memory, quantization, eviction, and recovery requirements should a production A17 Pro chat app specify?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0961", "title": "KV-Cache: Specification — KV Cache Budget for Multi-Turn Code Completion", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": 
"L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the KV cache budget, quantization, layout, and context management for this code completion assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0962", "title": "KV-Cache: Specification — KV Cache Constraints for Latency SLA on Tensor G3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Under this bandwidth-bound decode model, what maximum KV cache size, required KV quantization, and context limit should be specified to meet the 30 tokens/sec SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0963", "title": "Latency Decomposition: Compare On-Device vs. Cloud LLM Latency on A17 Pro", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For a 1B language model, which deployment approach wins TTFT and TPOT for 50 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0964", "title": "Latency Decomposition: Compare CoreML vs. TFLite Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency between Android NNAPI and iOS CoreML including framework overhead and memory transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0965", "title": "Latency Decomposition: Compute TTFT and TPOT for On-Device LLM on Snapdragon 8 Gen 3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute TTFT for a 256-token prompt and TPOT for decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0966", "title": "Latency Decomposition: Full Mobile ML App Latency Audit with User-Perceived Delay", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the tap-to-preview latency breakdown for the style transfer app, and which component should be optimized first?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0967", "title": "Latency Decomposition: Optimize Wake-Word Detection Latency on Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you cut wake-word 
P99 latency from 180 ms to under 80 ms for a 1M-param INT8 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0968", "title": "Latency Decomposition: Size Inference Latency Budget for AR App on A17 Pro", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency budget does each AR segmentation pipeline stage consume, and how much 60 FPS slack remains?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0969", "title": "MLOps Lifecycle: Compare App Store vs. Dynamic Model Updates for iOS ML Apps", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do App Store bundled models and dynamic post-install downloads compare regarding latency, compliance, and bandwidth cost for 10M users?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0970", "title": "MLOps Lifecycle: Compare Federated Learning vs. Centralized Training for Mobile", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you quantify the gradient upload bandwidth, privacy implications, and model accuracy tradeoffs between federated and centralized training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0971", "title": "MLOps Lifecycle: End-to-End Mobile ML Platform Design for Consumer App", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What storage, bandwidth, and engineering overhead come from 10 variants of a 30 MB photo model for 50M users?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0972", "title": "Federated Mobile Health Personalization Lifecycle for 5M Devices", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the full lifecycle including on-device training, privacy-preserving aggregation, global model update, and OTA distribution to 5M devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0973", "title": "MLOps Lifecycle: Optimize Mobile App CI/CD Pipeline for ML Model Updates", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the most effective way to optimize the non-training stages of the CI/CD pipeline to drastically reduce the total time?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0974", "title": "MLOps Lifecycle: Size OTA Update Bandwidth for 100M Mobile App Users", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What monthly OTA model update bandwidth and CDN cost result for 100M users, and how much can delta updates save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0975", "title": "MLOps Lifecycle: Size Model Registry Storage for Mobile Segment Variants", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much registry storage and monthly S3 cost are needed for the four mobile model tiers and retained versions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0976", "title": "MLOps Lifecycle: Specify A/B Test Framework for On-Device Model Comparison", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify an A/B testing framework for comparing these models, defining sample size, assignment, metrics, and decision criteria?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0977", "title": "Model Format Conversion: Choose CoreML Path for TensorFlow MobileNetV3 on iOS", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For the iOS artifact, should the team convert through TFLite-to-CoreML or export through ONNX-to-CoreML, and what are the performance and maintenance tradeoffs?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 0}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0978", "title": "Model Format Conversion: Compare INT8 vs. 
INT4 CoreML Quantization on A17 Pro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under that bandwidth-bound decode assumption, how would you compare model size, per-token inference latency, 50-token decode latency, KV-cache headroom, and accuracy for INT8 versus INT4?", "chain_ids": ["mobile-chain-auto-001-12"], "chain_positions": {"mobile-chain-auto-001-12": 1}, "chain_tiers": {"mobile-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0979", "title": "Model Format Conversion: Implement ONNX→CoreML Conversion with Numerical Validation", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What steps, validation metrics, expected size, and latency should be used when converting MobileNetV3-Small from ONNX to CoreML?", "chain_ids": ["mobile-chain-auto-001-11"], "chain_positions": {"mobile-chain-auto-001-11": 1}, "chain_tiers": {"mobile-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0980", "title": "Model Format Conversion: Full Stack LLM Conversion for On-Device iOS Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you evaluate conversion time, model size, TTFT, TPOT, and maximum context length for a 1B model in 8GB RAM?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 4}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0981", "title": "Multi-Platform LLM Model Format Strategy for iOS and Android", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the full multi-platform model format strategy for size and latency?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 2}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0982", "title": "TFLite Mobile NPU CPU Fallback and Graph Fragmentation", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an EfficientNet-Lite0 TFLite model converted through the SDK show 45 ms latency instead of the 8 ms target?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": {"mobile-chain-auto-001-03": 1}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "mobile-0983", "title": "Model Format Conversion: Size ONNX vs. TFLite Model Storage for Mobile App", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How large are the bundled ONNX FP32 and TFLite INT8 model variants, and do they fit under the 200MB app limit?", "chain_ids": ["mobile-chain-auto-001-11"], "chain_positions": {"mobile-chain-auto-001-11": 0}, "chain_tiers": {"mobile-chain-auto-001-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0984", "title": "Model Format Conversion: Recall ONNX Opset Compatibility for Mobile Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the recommended ONNX opset version for converting PyTorch models to CoreML, and why does opset version matter?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 0}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0985", "title": "Model Format Conversion: Specify Multi-Platform Model Conversion CI/CD Requirements", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a production CI/CD pipeline for converting and validating PyTorch models for simultaneous iOS and Android deployment?", "chain_ids": ["mobile-chain-auto-001-07"], "chain_positions": {"mobile-chain-auto-001-07": 1}, "chain_tiers": {"mobile-chain-auto-001-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0986", "title": "Model Size Estimation: Analyze Why Mobile LLM Needs INT4 Despite 8GB RAM", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 3B parameter model require INT4 quantization to run on an 8GB iPhone 16 Pro?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0987", "title": "Model Size Estimation: Design On-Device ML Memory Architecture for Dual-Model iPhone App", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the app keep preview and high-quality portrait models in memory while meeting 60 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0988", "title": "3B LLM Memory Budgeting on 16GB Android Devices", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a mobile device 
allocate precision, KV cache, and context length for a 3B LLM in 16 GB RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0989", "title": "Model Size Estimation: Compare 1B vs. 3B LLM Memory on A17 Pro", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using the INT4 sizes, memory, KV dims, and decode TPOT below, how do model size, KV capacity at 4096 ctx, decode latency, and max retained context before overflow compare?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0990", "title": "Model Size Estimation: Compare INT4 vs. INT8 Memory for Mobile LLM on Snapdragon", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare memory footprint, TPOT, maximum batch size, and accuracy implications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0991", "title": "Model Size Estimation: Fluency — Size Mobile LLM Memory in 60 Seconds", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much memory does a 1B INT4 model use on a 16 GB phone, and what remains for KV cache?", "chain_ids": ["mobile-chain-auto-027-12"], "chain_positions": {"mobile-chain-auto-027-12": 0}, "chain_tiers": {"mobile-chain-auto-027-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0992", "title": "Model Size Estimation: Implement Parameter Count for Custom Transformer on Mobile", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many parameters and INT8 bytes does a 6-layer 512-hidden transformer with a 32K vocab require, including embeddings, attention, and FFN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0993", "title": "On-Device LLM Memory Audit for iOS KV Cache and Jetsam Limits", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you verify the system stays under the 5GB practical ceiling?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 3}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0994", "title": "Multimodal Mobile LLM Memory Layout and TPOT Estimate", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How would you compute full memory layout and TPOT for text generation after image encoding?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0995", "title": "Model Size Estimation: Diagnose Mobile OOM from KV-Cache Growth", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of the OOM crashes and quantify a fix?", "chain_ids": ["mobile-chain-auto-027-12"], "chain_positions": {"mobile-chain-auto-027-12": 1}, "chain_tiers": {"mobile-chain-auto-027-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0996", "title": "Hypothetical Snapdragon 8 Gen 3 LLM TPOT Slowdown from Memory Pressure", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a 7B INT4 LLM on Snapdragon 8 Gen 3 slow from 45ms to 180ms TPOT after 5 minutes, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0997", "title": "Model Size Estimation: Realize Full Memory Layout for 3B LLM on Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 3B INT4 LLM plus 4096-token KV cache fit on Snapdragon 8 Gen 3, and what TPOT follows from 77 GB/s?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 0}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0998", "title": "1B INT4 LLM Memory and TPOT Comparison Across Mobile SoCs", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do A17 Pro, Snapdragon 8 Gen 3, and Tensor G3 compare on memory budget, concurrent KV-cache users, and TPOT for a 1B INT4 LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-0999", "title": "Snapdragon On-Device LLM Size Under Memory and TPOT Constraints", "topic": "model-size-estimation", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you choose the model family, precision, max parameter count, and accuracy validation plan for these constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1000", "title": "Shared LPDDR5X Bandwidth Bottleneck for LLM and 4K Video Encode", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": 
"L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much of a 77 GB/s Snapdragon memory bus does a 3B INT4 LLM consume alongside 4K H.265 encoding, and how does it degrade TPOT?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 0}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1001", "title": "A17 Pro Sustained Inference Bandwidth Throttling from TPOT", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Under a weight-streaming approximation, what does a 7.4ms to 14ms TPOT jump reveal about sustained bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1002", "title": "LPDDR5X Bandwidth Scheduling for Mobile Multi-Model Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a scheduler prevent memory bandwidth saturation across concurrent speech, LLM, and depth tasks?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1003", "title": "3B INT4 On-Device Decode Bandwidth Pipeline for Sub-30ms TPOT", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a bandwidth-efficient decode pipeline for a 3B INT4 LLM targeting TPOT < 30ms by leveraging weight tiling, prefetching, and KV-cache layouts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1004", "title": "Network Bandwidth Bottlenecks: Diagnose LPDDR5X Bandwidth Saturation on Snapdragon", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do LLM TPOT and camera inference both hit 60% throughput when memory bus use reaches saturation, and how do you fix it?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 1}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1005", "title": "Unified Memory and Metal GPU Contention Causing Mobile LLM TPOT Regression", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a 60 FPS Metal animation explain a 1B INT4 LLM TPOT jump from 7.4ms to 11ms on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1006", "title": "Network 
Bandwidth Bottlenecks: Evaluate On-Device vs. Cloud for Bandwidth-Constrained Mobile Users", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do end-to-end latencies for on-device and cloud LLM inference compare across 5G and LTE network conditions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1007", "title": "Network Bandwidth Bottlenecks: Evaluate Speculative Decoding Bandwidth Impact on Mobile", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "On Snapdragon 8 Gen 3, does a 100M draft plus 3B INT4 target improve TPOT after bandwidth costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1008", "title": "RTT Crossover for On-Device Versus Cloud Mobile LLM Latency", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which path is faster for the listed networks, and what RTT crossover makes cloud faster for a 128-token generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1009", "title": "Network Bandwidth Bottlenecks: Fluency — Mobile Bandwidth Math in 60 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the decode-bound TPOT and what's the memory bandwidth utilization during a single-user chat session?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1010", "title": "A17 Pro LLM Decode Arithmetic Intensity and Memory-Bandwidth TPOT", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute arithmetic intensity, peak FLOPS-bound TPOT, peak BW-bound TPOT, and determine which bottleneck applies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1011", "title": "Mobile AI Memory Bandwidth Conflicts on Snapdragon 8 Gen 3", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What bandwidth conflicts arise when these workloads run together, and what scheduler changes are needed?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 3}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-1012", "title": "Mobile SoC Roofline Analysis for INT4 LLM Decode", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "If each sequence streams its own 3B INT4 weights from DRAM, does batching shift the workload from bandwidth-bound to compute-bound, and what does that mean for mobile serving?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 4}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1013", "title": "A17 Pro INT4 Quantization for 3B LLM TPOT", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which quantization strategy gets a 3B FP16 LLM on A17 Pro from 88 ms TPOT to under 30 ms?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 3}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1014", "title": "Speculative Decoding TPOT for a 7B INT4 LLM on Snapdragon", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 200M draft model make a 7B INT4 LLM meet a sub-20ms TPOT target on Snapdragon 8 Gen 3, and how does it compare to reducing model size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1015", "title": "Mobile Camera Pipeline Memory Bandwidth Optimization on A17 Pro", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What memory bandwidth components explain the 15GB/s camera pipeline load, and which optimizations reduce it below 8GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1016", "title": "LPDDR5X Bandwidth Budget for a Snapdragon 8 Gen 3 Mobile AI Stack", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What sustained average LPDDR5X bandwidth does the mobile AI stack consume, and does it fit under 77GB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1017", "title": "Realize Peak Bandwidth Utilization During On-Device LLM Response", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What peak and average LPDDR5X bandwidth does a 50-token 3B INT4 
LLM response require on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1018", "title": "Mobile LLM Memory Bandwidth QoS for TPOT SLA", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What bandwidth reservations, priorities, throttles, and fallbacks are needed to keep mobile LLM TPOT under 25ms P95?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1019", "title": "Multi-Model Mobile Bandwidth Admission Control", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a complete QoS framework to manage 77 GB/s bandwidth across 5 simultaneous ML models?", "chain_ids": ["mobile-chain-auto-022-09"], "chain_positions": {"mobile-chain-auto-022-09": 2}, "chain_tiers": {"mobile-chain-auto-022-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1020", "title": "LPDDR5X Bandwidth on Snapdragon 8 Gen 3 vs Tensor G3", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the memory bandwidth of LPDDR5X on Snapdragon 8 Gen 3, how does it compare to LPDDR5 on Tensor G3, and why does this matter for LLM inference?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 0}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1021", "title": "Network Bandwidth Bottlenecks: Specification for Minimum Bandwidth to Support On-Device LLM", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive the memory bandwidth spec from first principles and explain how it constrains SoC design choices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1022", "title": "Model Format Conversion: Mobile Fluency — CoreML Model Conversion Pipeline in 60 Seconds", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the 4 steps to convert a PyTorch MobileNet model to CoreML, and what is the expected size of a 5M parameter model in FP16?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 2}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1023", 
"title": "Model Format Conversion: Optimize INT8 to INT4 Conversion for Android LLM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose which quantization approach to use and quantify the accuracy-size tradeoff?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1024", "title": "Cross-Platform Model CDN Rollout Cost", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the CDN delivery cost for 50M total installs of multi-format models, and how much does delta updating save?", "chain_ids": ["mobile-chain-auto-001-05"], "chain_positions": {"mobile-chain-auto-001-05": 2}, "chain_tiers": {"mobile-chain-auto-001-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1025", "title": "Network Bandwidth Bottlenecks: Design Offline-First Bandwidth Architecture for Rural Mobile LLM", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify: on-device model configuration, model download strategy, graceful degradation for different connectivity levels, and cloud fallback criteria?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1026", "title": "Mobile LLM Memory Bandwidth Contention from Background Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What would you identify as the root cause using bandwidth analysis and quantify the fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1027", "title": "Network Bandwidth Bottlenecks: Fluency — LPDDR5X BW and TPOT Math in 30 Seconds", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the TPOT and what fraction of the memory bus does it saturate during decode?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 1}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1028", "title": "Network Bandwidth Bottlenecks: Fluency — Speculative Decode BW Arithmetic", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the effective TPOT assuming an 80% acceptance rate?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1029", "title": "Memory-Bandwidth-Bound TPOT for Mobile LLM Decode", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you derive the memory-bandwidth-bound TPOT formula, and what is the latency for a 3B INT4 model on a device with 77 GB/s bandwidth?", "chain_ids": ["mobile-chain-auto-022-08"], "chain_positions": {"mobile-chain-auto-022-08": 2}, "chain_tiers": {"mobile-chain-auto-022-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1030", "title": "Network Bandwidth Bottlenecks: Implement KV-Cache Bandwidth Impact at Long Contexts", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you compute TPOT at context lengths of 512, 2048, 4096, and 8192 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1031", "title": "Flash-Attention Memory Bandwidth Savings for LLM Prefill on Mobile SoCs", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the memory bandwidth savings from using Flash-Attention vs. standard attention for this 3B LLM prefill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1032", "title": "Network Bandwidth Bottlenecks: Master Memory Bus Analysis for Multi-LLM Mobile App", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you analyze whether both can run simultaneously without bandwidth saturation, and specify a token-interleaved schedule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1033", "title": "Mobile TCO Realization: Size Peak Bandwidth for Mobile LLM App", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "For Snapdragon 8 Gen 3 at 77 GB/s, what are peak and sustained bandwidth for a 3B INT4 LLM over 10 turns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1034", "title": "Network Bandwidth Bottlenecks: Specification — Minimum BW for 10-Turn Conversation SLA", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive the worst-case bandwidth requirement and compare against LPDDR5 (51.2 GB/s) vs LPDDR5X 
(77 GB/s)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1035", "title": "Hybrid 5G Routing for Mobile LLM Inference", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What routing rules should choose between cloud and on-device LLM inference under 5G latency, privacy, and budget constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1036", "title": "A17 Pro Voice Assistant LPDDR5X Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the peak LPDDR5X bandwidth requirements for each stage of the A17 Pro voice assistant pipeline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1037", "title": "Mobile TCO Analyze: A17 Pro vs Cloud for On-Device Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the annual battery degradation cost, cloud cost, and economic break-even point for on-device inference?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1038", "title": "Mobile TCO Design: 8GB vs 16GB NPU Inference Economics", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which mobile platform has the better two-year TCO and capability profile for a 10,000-device enterprise LLM deployment?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1039", "title": "Mobile TCO Design: On-Device vs Hybrid Inference Cost for Consumer App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the baseline monthly cloud cost, and what monthly and annual savings result from raising the on-device fraction to 85%?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1040", "title": "Mobile TCO Evaluation: A17 Pro vs Tensor G3 for On-Device ML App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate per-transcription energy 
cost for a 15-minute real-time audio transcription, and determine model compatibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1041", "title": "Mobile TCO Evaluation: On-Device vs Cloud for Photo Enhancement App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For SD v1.5 at 5 images/day, which is cheaper over two years: on-device or a $0.01 cloud API, and where is break-even?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1042", "title": "Mobile TCO Fluency: Quick Battery Cost Estimation for Mobile ML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What annual energy cost and battery-cycle impact do the mobile ML workloads create?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1043", "title": "Mobile TCO Implement: Calculate Cost Per Inference for Mobile App", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the inference time, energy per inference, annual energy cost per user, and cost if monetizing electricity as a service (at $0.10/kWh markup)?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1044", "title": "Mobile TCO Mastery: Enterprise Mobile AI Strategy Full Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How does the 3-year TCO of an on-device enterprise deployment compare to a cloud API at $0.001/1K tokens?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1045", "title": "Mobile TCO Mastery: On-Device AI ROI for Mobile App Monetization", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the 2-year NPV and ROI for building on-device LLM capabilities?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1046", "title": "Mobile TCO Optimization: Reduce Battery Drain for Intensive Mobile ML", 
"topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you quantify the TCO impact for 1M daily active users in electricity terms while reducing battery drain?", "chain_ids": ["mobile-chain-auto-secondary-013-19"], "chain_positions": {"mobile-chain-auto-secondary-013-19": 4}, "chain_tiers": {"mobile-chain-auto-secondary-013-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1047", "title": "Mobile Music Generation Two-Year TCO", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does the scoped 2-year TCO of on-device music generation compare to cloud generation at $0.05 per song?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1048", "title": "Mobile TCO Realization: Memory Cost of On-Device Model Hosting", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can these three models fit in the A17 Pro's 8GB RAM budget, and what is their effective storage cost on a 256GB ($1199) device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1049", "title": "Mobile SoC Limits for On-Device LLMs", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What TOPS, RAM, power, and maximum INT4 and FP16 LLM sizes characterize modern mobile SoC platforms?", "chain_ids": ["mobile-chain-auto-secondary-013-20"], "chain_positions": {"mobile-chain-auto-secondary-013-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1050", "title": "Mobile TCO Specification: Design Mobile-First AI Product Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the on-device vs cloud split, annual revenue, cost breakdown, and gross margin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1051", "title": "Mobile Transformer Cost Evaluation: On-Device LLM Viability on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can Llama-3-8B INT4 meet a P99 under 3 seconds for 100-token responses on the A17 Pro, given RAM and bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1052", 
"title": "Mobile Transformer Cost Evaluation: Snapdragon vs A17 Pro LLM Performance", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate decode throughput (tokens/second), latency for 200-token response, and which platform better utilizes its memory advantage for LLM decode?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 2}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1053", "title": "Mobile Transformer Cost Fluency: Quick LLM Sizing for Mobile", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the quick estimates for maximum INT4 LLM size, 3B decode speed, and 1024-token prefill cost on an 8GB device with 100GB/s bandwidth?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 1}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1054", "title": "Mobile Transformer Cost Implement: Calculate Token Budget for Mobile LLM", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate model memory, KV cache per token, max context tokens given RAM constraints, and verify the throughput SLO?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1055", "title": "Mobile Transformer Cost Mastery: Full On-Device LLM Product Design", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you choose model size and quantization for each task and calculate memory allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1056", "title": "Mobile LLM Pareto Frontier on Snapdragon 8 Gen 3: Latency vs Perplexity", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What would you identify as the Pareto-optimal model for a voice assistant (SLO: P99 < 3s, quality: perplexity < 12)?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 4}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1057", "title": "Mobile Transformer Cost Optimization: Quantization + Speculative for Mobile LLM", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": 
"optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimization achieves P99 < 3s with minimum quality loss?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 3}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1058", "title": "Mobile Transformer Cost Optimization: On-Device vs Streaming Hybrid", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an optimal hybrid policy that minimizes cost while maintaining P99<3s across network conditions, and what is the cost per 1000 daily queries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1059", "title": "Speculative Decoding Memory Fit on Snapdragon 8 Gen 3: 1B Draft and 7B INT4 Target", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is there room for a 7B INT4 target model alongside the draft model for speculative decoding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1060", "title": "Mobile Transformer Cost Recall: Mobile LLM Memory Bandwidth Facts", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the memory bandwidth for these SoCs, and what is the decode tokens/second for a 3B INT4 model on each?", "chain_ids": ["mobile-chain-auto-018-04"], "chain_positions": {"mobile-chain-auto-018-04": 0}, "chain_tiers": {"mobile-chain-auto-018-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1061", "title": "On-Device Multi-Task Model Memory Budgeting", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which model architecture and memory allocation allows keyword spotting, chat, and summarization to meet their SLOs simultaneously?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1062", "title": "Diagnose Thermal Throttling Cost Impact on A17 Pro Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the cost impact of throttling on user experience and propose thermal-aware scheduling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1063", "title": "Analyze Transformer Token Budget on Snapdragon 8 Gen 3", "topic": "transformer-systems-cost", "competency_area": 
"compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory allocation for supporting 10 concurrent conversations, each with its own 4K context window, given 32 layers, 8 KV heads, and a 64 head dimension?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1064", "title": "Design On-Device LLM Architecture for Tensor G3 Within Thermal Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What LLM architecture fits under the 3W power, 500ms TTFT, and 50ms/token decode constraints on the Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1065", "title": "Design Prompt Caching Strategy for Mobile LLM on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What memory cost and TTFT savings come from caching a 512-token system-prompt KV cache across requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1066", "title": "Diagnose Attention Softmax Precision Loss on Mobile NPU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do INT8 attention logits on a mobile NPU degrade the LLM from 89% at 128 tokens to 61% at 512 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1067", "title": "Implement Mobile LLM Decode Benchmark on Tensor G3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a decode benchmark isolate inference time and compare measured tokens per second to the bandwidth limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1068", "title": "Llama 3.2 1B KV-Cache Sizing for A17 Pro NPU Deployment", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the correct KV-cache size for the Llama 3.2 1B deployment, and why is the larger estimate wrong?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1070", "title": "Speculative Decoding Feasibility on Mobile NPU", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does speculative decoding impact on-device latency given the NPU cannot 
run two models concurrently and must swap them in shared memory?", "chain_ids": ["mobile-chain-auto-secondary-017-33"], "chain_positions": {"mobile-chain-auto-secondary-017-33": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1071", "title": "Shared Early-Exit Speculative Decoding", "topic": "speculative-decoding", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a shared-weight speculation system and analyze its memory savings compared to a separate 0.5B draft?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1072", "title": "On-Device KV-Cache Budget for Mobile LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What KV-cache context length fits a 3B INT4 LLM on A17 Pro when only 2GB remains for KV storage?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 3}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1073", "title": "KV-Cache Persistence Across App Sessions on Mobile", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a KV-cache persistence strategy that avoids re-prefilling tokens when the app is relaunched?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1074", "title": "KV-Cache Quantization for On-Device LLM", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare FP16 vs INT8 vs INT4 KV-cache strategies for maximum concurrent conversations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1075", "title": "KV-Cache Impact on Mobile Thermal Throttling", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much of a 3B LLM slowdown from 15 tok/s at token 200 to 6 tok/s at token 800 is KV growth versus heat?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1076", "title": "KV-Cache Layout for NPU Acceleration", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the optimal memory layout for the ANE, and why does innermost dimension 
alignment matter?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1077", "title": "Shadow Deployment Frame Drops on Exynos NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does shadow deployment on the device drop camera frame rate despite low TOPS utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1078", "title": "Tensor G3 CPU vs TPU Decoding Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the highly efficient TPU not provide a proportional speedup for decoding, and what is its theoretical compute limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1079", "title": "Adversarial Sparsity Loss on Exynos 2400 NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do adversarial camera inputs cause cross-component interference on Exynos 2400 despite fixed model FLOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1080", "title": "Analyzing GQA vs MHA Memory Bandwidth on Hexagon NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this architectural change provide such a significant speedup on this specific hardware?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1081", "title": "Depthwise Convolutions on Tensor G3 TPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why the latency reduction does not scale linearly with the FLOP reduction on this specific hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1082", "title": "Mobile NPU Inference Cost Analysis on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Snapdragon 8 Gen 3 theoretical 45 TOPS metric not perfectly translate to the observed frame rate?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1083", "title": "On-Device Coreset Selection and NPU Memory Contention", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does increasing an Exynos 2400 coreset from 500 to 2000 samples drop NPU utilization below 5%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1084", "title": "A17 Pro Unified Memory Contention in Video Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does this data pipeline bottleneck the system despite the available compute overhead?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 1}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1085", "title": "On-Device Data Quality Bandwidth Contention", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does enabling a Tensor G3 anomaly-detection gate cut Gemini Nano token generation despite only 10% TPU use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1086", "title": "Dataset Padding Impact on NPU Utilization", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does feeding a 3840x3840 padded image degrade the NPU's effective TOPS and shift it to a memory-bound state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1087", "title": "Mobile DP-SGD Per-Sample Gradient Memory Wall", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does increasing the DP-SGD batch size from 32 to 256 cause memory failure and thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1088", "title": "On-Device PSI Drift Compute Bottleneck", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why this specific statistical detection workflow degrades system reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1089", "title": "Analyze Encoder vs Decoder Latency on Snapdragon NPU", "topic": 
"encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1B encoder run faster per token than a 1B decoder on a mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1090", "title": "Energy Analysis of Memory Access in LLM Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does battery drain come primarily from memory weight reads during Gemini Nano token generation instead of TPU compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1091", "title": "Analyzing 3-bit AWQ Overhead on Mobile TPUs", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 3-bit AWQ quantization cause a severe compute bottleneck on the mobile TPU architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1092", "title": "NPU Frame Dropping Impact on Equalized Odds", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this hardware-driven throttling degrade the Equalized Odds fairness metric specifically for Subgroup A, and what is the True Positive Rate calculation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1093", "title": "Federated Training NPU Underutilization", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is this massive NPU compute resource severely underutilized during the local training phase?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1094", "title": "Hexagon NPU Thermal Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the system switch to a less efficient CPU fallback instead of running the primary model at 15 FPS on the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1095", "title": "Mobile NPU Operator Fusion Performance Anomaly", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can LayerNorm+MatMul fusion run slower than unfused ops despite reducing LPDDR memory traffic?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1096", "title": "Distilled vs Pruned Memory Bandwidth", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does Model B cause stuttering and what is the difference in memory bandwidth consumption?", "chain_ids": ["mobile-chain-auto-secondary-014-21"], "chain_positions": {"mobile-chain-auto-secondary-014-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1097", "title": "Exynos 2400 Shared Memory Exhaustion", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this specific performance cliff happen at this exact context length based on the hardware specs?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1098", "title": "Analyzing TTFT vs TPOT Bottlenecks", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the end-to-end latency of an on-device LLM become heavily bottlenecked during the decode phase (TPOT) rather than the prefill phase (TTFT)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1099", "title": "Hexagon NPU Memory Bandwidth Bottleneck Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does batch-size-1 INT8 sequence inference on the Hexagon NPU achieve only about 1.5 TOPS despite a 45 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1100", "title": "NPU Cold Start Latency with Memory-Mapped Weights", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming a 4KB page size and 4GB/s storage read speed, why does the NPU experience this first-run latency spike and what is the overhead?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1101", "title": "OOM Crash on A17 Pro During High-Res Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the OS kill the process during high-res inference, and how does the unified memory architecture contribute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1102", "title": "Mixed-Precision Bandwidth Contention on Exynos 2400", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does a 50/50 FP16-FP32 model suffer extra latency under Exynos ISP load, and how much extra bandwidth does a 10M parameter layer add?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1103", "title": "CI/CD to On-Device Performance Gap", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is a 3B INT8 model compute-bound in batched CI but only 1.6% utilized at batch size 1 on Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1104", "title": "CoreML Fallback Penalty on Apple A17 Pro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a 10% CPU fallback disproportionately degrade latency despite unified memory, and what is the new system throughput?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": {"mobile-chain-auto-001-02": 0}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1105", "title": "Analyzing OOM for 3B FP16 Model on A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 6 GB FP16 3B model OOM on A17 Pro even though the phone has 8 GB unified memory?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 0}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1106", "title": "Investigating Tail Latency Spikes Under Memory Pressure", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the TPU utilization drop during memory pressure events, and what is the expected tail latency floor assuming a flash storage read speed of 2.5 GB/s?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1107", "title": "Dual-Core NPU Scheduling Bottleneck Analysis", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why this scheduling choice degrades performance on this specific architecture?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 1}, "chain_tiers": 
{"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1108", "title": "Thermal Throttling on A17 Pro Neural Engine", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this sustained workload exceed the power budget and trigger thermal throttling on the shared unified memory architecture?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1109", "title": "A17 Pro Neural Engine Memory Bottleneck", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the Neural Engine underutilizing its 35 TOPS compute capacity, and what is the system bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1110", "title": "Unstructured Sparsity Inefficiency on Apple A17 Pro", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does unstructured sparsity fail to yield hardware performance gains, and what is the expected compute-bound execution time?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1111", "title": "INT8 Quantization Memory Bandwidth Analysis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does INT8 PTQ double throughput from 15 FPS to 30 FPS while compute utilization stays at 25%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1112", "title": "Jank Analysis of On-Device LLM on Tensor G3", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do frame drops and responsiveness warnings occur when Gemini Nano runs on Tensor G3's TPU during 60 FPS scrolling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1113", "title": "Guardrail Latency on Tensor G3 TPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 guardrail miss a 100ms SLA while Gemini 
Nano generates a 500-token response?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1114", "title": "Diagnosing A17 Pro Neural Engine Utilization", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this image enhancement model reach only 10% of the A17 Pro Neural Engine's peak utilization at batch size 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1115", "title": "Unified Memory Contention Watchdog Reset", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this non-deterministic execution occur and what is the constrained memory bandwidth under contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1116", "title": "Continuous Video Feature Ingestion Power Analysis", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much power does continuous 60 FPS feature ingestion consume, and why does a concurrent game exceed the thermal budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1117", "title": "Thermal Throttling in Shared NPU/ISP Pipelines", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does peak Exynos 2400 NPU use throttle in this camera pipeline, and what NPU TOPS caps should the governor use across high-resolution and preview modes?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1118", "title": "KV-Cache OOM on A17 Pro Unified Memory", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 3B 4-bit transformer with 4000-token context OOM under a 2.5 GB background-app limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1119", "title": "Shadow Deployment Architecture for On-Device LLM", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro shadow deployment run a 
new translation model safely without disrupting the active model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1120", "title": "On-Device LLM Guardrails against Prompt Injection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a defense-in-depth strategy that robustly filters adversarial inputs without violating memory constraints or battery limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1121", "title": "Design On-Device Attention for Gemini Nano on Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an attention architecture that optimizes for the G3's memory constraints and TPU capabilities while maintaining acceptable summarization quality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1122", "title": "Architecting a Real-Time Video Segmentation Model for Exynos NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs of using larger feature maps early in the network versus aggressive downsampling given the shared memory bandwidth constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1123", "title": "Architecting On-Device LLM Workloads for Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect an on-device intelligence pipeline for a messaging app running exclusively on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1124", "title": "On-Device Coreset Selection Architecture", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the tradeoffs between heuristic filtering and model-based data selection to maximize the Information-Compute Ratio (ICR) without causing memory thrashing?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1125", "title": "Real-time Video Quality Gating on Exynos 2400", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Where should the quality validation run, and how does early frame rejection balance power against accuracy?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], 
"chain_positions": {"mobile-chain-auto-secondary-012-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1126", "title": "On-Device Active Learning Data Curation for Gemini Nano", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the data selection criteria and annotation workflow to filter high-value samples while minimizing battery impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1127", "title": "On-Device Drift Detection for Exynos 2400", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 camera pipeline detect drift on device without exporting raw frames or starving ISP/NPU bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1128", "title": "On-Device Translation Architecture on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is a specialized encoder-decoder architecture preferable to a unified decoder-only LLM for Tensor G3 live translation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1129", "title": "Always-On Vision Architecture for Exynos 2400", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an always-on continuous object tracking architecture for the Samsung Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1130", "title": "Architecting a Sub-4-bit LLM Pipeline for Samsung Exynos 2400", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the design choices between techniques like GPTQ and AWQ, considering the NPU's hardware execution profile?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1131", "title": "On-Device Intersectional Fairness Telemetry Pipeline", "topic": "fairness-evaluation", "competency_area": 
"cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the evaluation pipeline, detailing how you schedule the evaluation models, manage memory contention, and aggregate subgroup metrics securely?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1132", "title": "On-Device LLM Degradation Ladder for Tensor G3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you ensure the translation service remains fail-operational without degrading the overall OS experience under severe memory or thermal pressure?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1133", "title": "Designing a Distillation Pipeline for the A17 Pro Neural Engine", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a distilled LLM student be trained and sized to fit mobile memory, latency, and power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1134", "title": "Paged KV-Cache Architecture on A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile KV-cache manager support 8K contexts without triggering OOM or latency spikes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1135", "title": "Architecting Memory-Mapped Inference for Gemini Nano on Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped weight loading strategy that minimizes cold-start delay for these apps while avoiding duplicate weight copies in the shared memory pool?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1136", "title": "Architecting Memory Management for On-Device LLMs", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a memory management strategy to handle the KV cache and weights without triggering OS-level OOM eviction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1137", "title": "Mixed-Precision Inference Architecture on A17 Pro", "topic": "mixed-precision-training", 
"competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 60 FPS A17 Pro segmentation model partition INT8 and FP16 work to balance speed, bandwidth, and accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1138", "title": "Architecting On-Device CI/CD for Exynos 2400 NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a CI/CD pipeline validate quantized on-device latency, accuracy, and shared-memory limits?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1139", "title": "Designing Graph Delegation for Snapdragon 8 Gen 3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a conversion and delegation strategy that safely evaluates tradeoffs between INT8 quantization, operator fallback penalties, and heterogeneous compute allocation?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": {"mobile-chain-auto-001-03": 2}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1140", "title": "On-Device LLM Memory Architecture for Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the memory allocation, quantization strategy, and execution pipeline to safely deploy this model while maximizing the 45 TOPS INT8 compute available on the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1141", "title": "Exynos 2400 NPU Telemetry and Straggler Detection System", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should telemetry detect thermal throttling and memory stragglers with low reporting overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1142", "title": "Hardware-Aware NAS Design for A17 Pro Neural Engine", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should hardware-aware NAS target A17 Pro video super-resolution while respecting 60 FPS and memory-bandwidth limits?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1143", "title": "Cross-Core Operator Scheduling for A17 Pro", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro operator scheduler maximize Neural Engine utilization without unified-memory thrashing or throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1144", "title": "Architecting Heterogeneous Profiling for NPU Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Snapdragon 8 Gen 3 profiling system distinguish NPU compute limits, sync overhead, and memory contention?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1145", "title": "Architecting a Pruning Strategy for Hexagon NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect the end-to-end pruning strategy, detailing your choice of sparsity patterns and how you would align them with the Hexagon architecture to guarantee the speedup?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1146", "title": "Architecting On-Device Guardrails for Generative Text", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the system concurrently run a 3B generator and safety classifier without breaking latency or safety?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1147", "title": "Architecting a Fail-Safe ADAS Monitor on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Hexagon NPU dashcam pipeline detect NPU hangs or memory faults and enter a safe state within a 100ms fault-tolerant time interval?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1148", "title": "Real-Time Sensor Fusion Architecture on Snapdragon 8 Gen 3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an always-on Snapdragon 8 Gen 3 sensor-fusion pipeline ingest data and run inference without waking the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1149", "title": "Architecting Thermal-Aware Sustained Video Processing on A17 Pro", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile video super-resolution pipeline sustain 30 FPS under a strict continuous thermal envelope?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1150", "title": "LLM Memory Allocation on Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a memory budgeting strategy for an on-device conversational AI on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1151", "title": "Shadow Deployment LMK Evictions on Tensor G3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the Tensor G3 features 12 GB of LPDDR5X RAM and a 7.5 TOPS TPU, what hardware-level interaction is causing this shadow deployment to fail?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1152", "title": "Diagnosing High Latency in On-Device Gemini Nano Sanitization", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the interaction between the models and the hardware to identify the root cause of the latency?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1153", "title": "Diagnosing Low NPU Utilization in MobileNet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of the severe NPU underutilization and high 
latency in MobileNetV2 depthwise convolutions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1154", "title": "On-Device Fine-Tuning Battery Drain and Model Collapse", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this system failure and propose a data-centric solution?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1155", "title": "NPU Fallback from Data Contract Violations", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why do FP32 preprocessing violations push an INT8 model from 2 ms to over 60 ms?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1156", "title": "A17 Pro NPU Dataset Bias in Low-Light Portrait Segmentation", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this discrepancy between validation metrics and real-world performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1157", "title": "Diagnosing Drift Detection Latency Spikes on Hexagon NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why the PSI calculation is causing system-wide jank on this heterogeneous architecture?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1158", "title": "Low Compute Utilization During On-Device Decoding", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the A17 Pro's 35 TOPS capability and 8 GB of shared unified memory, what architectural characteristic of the encoder-decoder model causes this specific hardware symptom?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1159", "title": "Diagnosing LLM Latency Bias on Tensor G3", "topic": "fairness-evaluation", "competency_area": 
"cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this disparity, and how do the hardware specifications explain the symptom?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1160", "title": "Neural Engine OOM During Model Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does graceful degradation from a heavy to lightweight A17 Pro model trigger OOM, and how should it be fixed?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1161", "title": "Diagnosing High Latency in Feature-Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What architectural trait inherited from feature distillation causes memory bandwidth saturation on a mobile NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1162", "title": "Diagnosing OOM during long-context Gemini Nano inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely cause of this crash, and how would you diagnose the memory pressure during this long-context generation?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1163", "title": "Camera App Transition OOM on Exynos 2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does switching between the camera app and an Exynos 2400 segmentation app cause OOM despite the device having 12 GB of shared LPDDR5X memory?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1164", "title": 
"Diagnosing NaN Outputs in On-Device LLM", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a 1.8B INT8-FP16 LLM on Tensor G3 produce NaNs on prompts with large attention scores?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1165", "title": "CI/CD Deployment Fallback Regression", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What training-serving gap caused the Hexagon NPU canary latency regression from 5ms to over 80ms?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": {"mobile-chain-auto-secondary-006-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1166", "title": "Diagnosing Delegation Fallback on Exynos 2400", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is an INT8 TFLite vision model 5x slower with low NPU utilization and saturated LPDDR5X?", "chain_ids": ["mobile-chain-auto-001-04"], "chain_positions": {"mobile-chain-auto-001-04": 0}, "chain_tiers": {"mobile-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1167", "title": "Diagnosing OOM on Exynos 2400 Shared Memory", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 3B INT8 conversational model OOM on Exynos 2400 during camera viewfinder despite 12 GB shared memory?", "chain_ids": ["mobile-chain-auto-027-11"], "chain_positions": {"mobile-chain-auto-027-11": 0}, "chain_tiers": {"mobile-chain-auto-027-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1168", "title": "Diagnosing Shared LPDDR Contention During NPU Inference", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do NPU p99 latency stragglers align with heavy GPU UI animations, and how should they be diagnosed?", "chain_ids": ["mobile-chain-auto-secondary-006-23"], "chain_positions": {"mobile-chain-auto-secondary-006-23": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1169", "title": "Diagnosing TPU Utilization Drop in Gemini Nano Decoding", "topic": 
"operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What operator scheduling or graph execution issue is most likely causing this symptom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1170", "title": "Diagnosing Latency Spikes on Exynos 2400", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely bottleneck causing these intermittent latency spikes, and how would you verify it using profiling tools?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1171", "title": "Guardrail CPU Fallback on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does invoking a 100M FP32 toxicity guardrail max the CPU and stall a Snapdragon 8 Gen 3 LLM app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1172", "title": "Watchdog Resets from NPU Memory Contention", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware architecture bottleneck causes these sudden latency spikes and watchdog resets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1173", "title": "Diagnosing NPU Thermal Throttling During Continuous On-Device LLM Inference", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does on-device LLM summarization slow down 3x after 5 minutes of continuous inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1174", "title": "CNN Architecture Choice for Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which architectural design patterns are most optimal for maximizing utilization on this specific NPU, and what tradeoffs must you make?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1175", "title": "Evaluating On-Device Data Quality Gates", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": 
"training", "question": "How would you evaluate these alternatives, considering the shared memory architecture, and recommend the better design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1176", "title": "Shadow Deployment vs Canary for Snapdragon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which strategy is better suited for this hardware, shadow deployment or a progressive canary rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1177", "title": "Real-Time Video Ingestion Compute Utilization", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What compute utilization does a 60 FPS video ingestion pipeline require for 50 GOPs per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1178", "title": "Adversarial Randomized Smoothing Latency on A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What robust FPS and energy per prediction result from 10-pass randomized smoothing on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1179", "title": "On-Device Vision Pipeline Memory Bandwidth Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the total continuous memory bandwidth (in MB/s) consumed by this exact data pipeline sequence to keep the NPU fed?", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 1}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1180", "title": "Calculate Energy Cost of Memory vs Compute on Hexagon NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does the Hexagon NPU layer spend on INT8 compute versus LPDDR5X memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1181", "title": "Dynamic KV-Cache Paging for A17 Pro Unified Memory", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an iOS LLM engine manage KV-cache pages on the Apple A17 Pro's unified memory to avoid OOM while preserving Neural Engine utilization?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], 
"chain_positions": {"mobile-chain-bucket-kvcachem-05": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1182", "title": "Optimizing Intersectional Fairness Evaluation on Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the bottleneck and optimize this pipeline using the G3s hardware to quantify demographic parity and equalized odds efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1183", "title": "A17 Pro Mixed-Precision Bottleneck", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you identify the core bottleneck, and what speedup is expected by transitioning to an INT8 weight-only format?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1184", "title": "A17 Pro Neural Engine Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the A17 Pro voice assistant latency be decomposed for a 50-token prompt and 100-token response?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 4}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1185", "title": "Designing KV-Cache for Exynos 2400 NPU Shared Memory", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Exynos 2400 manage a 4096-token KV cache for a 7B LLM without unified-memory OOM kills?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1186", "title": "Exynos 2400 NPU Real-Time Power Budgeting", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify a power management strategy and deployment architecture that ensures sustained 30 FPS performance without exceeding the 2.5W cap?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": 
"primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1187", "title": "On-Device Guardrail Pipeline for Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a mobile chat app perform low-latency on-device PII and toxicity guardrails?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1188", "title": "Analyzing Shadow Deployment OOM on Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does shadowing a 4GB language model beside a 2GB vision model OOM on Exynos 2400 during camera use?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1189", "title": "MHA vs GQA Decoding Bottlenecks", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NPU sit idle despite having 45 TOPS available, and why does Grouped-Query Attention (GQA) significantly increase the token generation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1190", "title": "INT8 Calibration Dataset Bias", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does calibrating INT8 detection on the first 1000 video frames cause a 40% nighttime accuracy drop?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 0}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1191", "title": "Encoder-Decoder Memory Bandwidth Advantage", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 1B/1B encoder-decoder translation model decode faster than a 2B decoder-only model on Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1192", "title": "Energy Cost of Memory vs Compute on Tensor G3", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "Why can a larger INT8 model that fits in SRAM use less energy than a smaller FP16 model fetching DRAM?", "chain_ids": ["mobile-chain-auto-secondary-013-13"], "chain_positions": {"mobile-chain-auto-secondary-013-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1193", "title": "3-bit vs 4-bit Unpacking Overhead on Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Explain why the smaller 3-bit model exhibits worse latency than the larger 4-bit model?", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1194", "title": "Quantization Bias Under Memory Contention", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does falling back to static quantization disproportionately impact equalized odds for the minority demographic?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1195", "title": "FedAvg Memory Bottleneck on Hexagon NPU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a FedAvg local epoch take 2.5 seconds instead of under 50ms despite high TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1196", "title": "Analyzing Hexagon NPU Fallback Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why should a throttled video pipeline fall back to a CPU model instead of a smaller NPU model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1197", "title": "TPU Kernel Fusion Limits", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the hardware constraints driving this compiler behavior and calculate the theoretical compute time for the linear layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1198", "title": "Distilled Model Memory Contention", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": 
"analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does latency spike during a live feed, and what is the effective memory bandwidth for loading the 25MB model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1199", "title": "Exynos Shared Memory KV-Cache Eviction", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Exynos 2400 transcription app hit OOM near 4,000 tokens only when the camera is active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1200", "title": "Gemini Nano TTFT Decomposition on Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What components make up the 600ms TTFT on the NPU, and which hardware limit dominates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1201", "title": "Analyzing Mmap Latency Spikes on Snapdragon 8 Gen 3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 500ms latency spike occur during the initial generation phase for a memory-mapped 2GB INT8 LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1202", "title": "Analyzing iOS Jetsam Eviction on Unified Memory", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this memory pressure eviction occur despite the theoretical availability within the 8 GB limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1203", "title": "Exynos NPU Inference Precision and Shared-Memory Contention", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the FP16 model suffer from high latency and low utilization despite the NPU having ample compute headroom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1204", "title": "CI/CD to Tensor G3 Deployment Discrepancy", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did a Tensor G3 deployment pass emulated CI/CD tests but crash with OOM and latency spikes in production?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1205", "title": "CoreML ANE Fallback Latency Analysis", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this specific operator gap cause a massive latency degradation despite the fast unified memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1206", "title": "A17 Pro OOM Analysis with KV Cache Scaling", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Analyze why the dynamic inference memory footprint exceeds the limit during this specific generation task?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1207", "title": "Tensor G3 LLM Straggler Latency Analysis", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why this performance degradation occurs and explain the system behavior using the provided hardware specifications?", "chain_ids": ["mobile-chain-auto-secondary-006-22"], "chain_positions": {"mobile-chain-auto-secondary-006-22": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1208", "title": "NAS Preference for Compute-Bound Layers on Exynos", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the NAS optimizer heavily penalize lower-FLOP depthwise architectures on this specific SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1209", "title": "Dual-Core NPU Operator Scheduling on Exynos 2400", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does parallel operator scheduling on a multi-core mobile NPU counter-intuitively increase latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1210", "title": "Always-On Inference Power Discrepancy", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an always-on model needing only 0.35 TOPS add 1.2W of system power?", "chain_ids": ["mobile-chain-bucket-powerbud-03"], "chain_positions": {"mobile-chain-bucket-powerbud-03": 1}, "chain_tiers": {"mobile-chain-bucket-powerbud-03": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1211", "title": "Analyzing ANE Memory Bottlenecks in Transformers", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 500M INT8 language model take 10 ms/token despite a 35 TOPS NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1212", "title": "Unstructured Pruning Performance Regression on A17 Pro", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does 75% unstructured sparsity cause latency regression and high power draw on a mobile Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1213", "title": "Per-Channel INT8 Latency on Exynos 2400", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does changing quantization granularity cause this latency regression on this NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1214", "title": "Jank Analysis in Tensor G3 Camera Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the 33.3ms frame budget being missed, considering the shared memory architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1215", "title": "Guardrail Latency on Tensor G3", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 2.5B INT8 Tensor G3 guardrail model see about 50 ms of latency before compute starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1216", "title": "A17 Pro Neural Engine Roofline Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the vision transformer reach only 7 TOPS, or 20% of peak, under roofline analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1217", "title": "Watchdog Resets on A17 Pro Neural Engine", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an A17 Pro driver-monitoring model miss a 15 ms watchdog heartbeat when CPU and GPU 
navigation load is high?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1218", "title": "A17 Pro Shared Memory Streaming Bottleneck", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does CPU-copying 24 MB 60Hz frames make a video pipeline miss its 16.6 ms budget despite ample NPU compute?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1219", "title": "Analyzing Sustained vs Burst Performance on Exynos 2400", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this performance degradation occur, and what is the theoretical sustained TOPS required to maintain the degraded framerate?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1220", "title": "Analyzing KV-Cache Constraints on A17 Pro", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the shared 8GB unified memory architecture become the primary bottleneck as sequence length increases?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1221", "title": "A17 Pro Unified Memory KV-Cache OOM", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4096-token prompt OOM a 7B INT4 model even though the weights fit in memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1222", "title": "Progressive Rollout Design for A17 Pro Neural Engine", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro progressive rollout test a thermal-heavy video model without exhausting unified memory?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1223", "title": "Architecting On-Device LLM Defenses on Apple A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": 
"L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device email summarizer defend against prompt injection using guardrails within memory and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1224", "title": "Architecting Long-Context Attention on Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Tensor G3 long-context summarization combine GQA, attention sinks, and sliding windows to fit 8K contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1225", "title": "Architecting an Efficient CNN for Exynos 2400 NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 object-detection CNN backbone reduce memory bandwidth while using the NPU effectively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1226", "title": "Architecting On-Device LLM Cost Strategy for Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a mobile deployment architect a 1.8B summarization model with INT4/INT8 tradeoffs and thermal fallback?", "chain_ids": ["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1227", "title": "On-Device Coreset Selection Architecture for Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you architect a system that evaluates the Information-Compute Ratio (ICR) of incoming data and selects a high-value coreset for on-device training without starving OS resources?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1228", "title": "Zero-Copy Data Pipeline for Snapdragon NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 4K60 vision pipeline use zero-copy buffers to feed the NPU efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1229", "title": "Edge Data Quality Gate 
Design for Exynos NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should the system validate ISP frames for blur and lighting before waking a heavy authentication model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1230", "title": "On-device Data Curation for Active Learning", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should Gemini Nano on Tensor G3 curate low-confidence smart replies locally without exposing private text?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 2}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1231", "title": "On-Device Drift Detection Architecture for Shared Memory Systems", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 perform on-device drift detection without caching raw frames or starving camera bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1232", "title": "Architecting Real-Time Translation on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design an offline, real-time speech translation system for a device powered by the Google Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1233", "title": "Architecting an Energy-Efficient Real-time Translation Pipeline", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 translation pipeline (34.7 TOPS NPU, 12 GB shared RAM) minimize energy by balancing NPU compute and LPDDR5X access?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1234", "title": "Architecting Sub-4-bit LLM Deployment on Mobile NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 7B translation LLM use extreme quantization while fitting memory and bandwidth limits?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1235", "title": "On-Device Fairness Evaluation Architecture", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an A17 Pro photography feature compute local fairness metrics under strict 100MB memory and 10ms latency constraints?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1236", "title": "Architecting Federated PEFT for LLMs on Tensor G3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a cross-device federated learning system to personalize an on-device LLM (like Gemini Nano) for predictive text formatting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1237", "title": "Tensor G3 LLM Degradation Ladder", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation architecture that ensures users always receive smart reply suggestions within a strict 300ms latency budget, regardless of system stress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1238", "title": "Exynos 2400 Graph Partitioning", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 AOT compiler partition graphs and plan memory while the ISP handles 4K video?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1239", "title": "A17 Pro NPU Distillation Pipeline Design", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a 7B teacher be distilled into an A17 Pro student optimized for INT8 real-time inference?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1240", "title": "A17 Pro Unified Memory KV-Cache Architecture", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you 
design a memory-safe KV-cache allocation and eviction policy that allows processing up to 4,000 tokens of context without exceeding the remaining 500 MB budget?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 4}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1241", "title": "Zero-Copy Memory Mapping for Gemini Nano on Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should edge system services share model weights to avoid exceeding RAM limits and TPU cold-start stalls?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1242", "title": "Architecting OOM-Resilient On-Device LLM Inference", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an LLM assistant manage a growing KV cache under a dynamic 4 GB AI memory budget without triggering an OOM kill?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1243", "title": "Architecting Mixed-Precision Inference on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What mixed-precision strategy fits an A17 Pro segmentation model within 500MB while preserving accuracy and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1244", "title": "On-Device CI/CD for Video Super-Resolution", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the end-to-end MLOps lifecycle to automate the compilation, hardware-in-the-loop profiling, and deployment of these models targeting an NPU with shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1245", "title": "Android NPU Custom Attention Fallback Strategy", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a deployment handle unsupported custom attention operators without CPU fallback bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1246", "title": "Architecting a Vision-Language Model on Snapdragon 8 Gen 3", "topic": "model-size-estimation", 
"competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 3B Snapdragon vision-language model fit a 4GB memory budget and handle bandwidth-bound decoding?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 2}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1247", "title": "On-Device Telemetry Design for Exynos 2400 Shared Memory", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 telemetry detect memory-contention stragglers caused by ISP load without adding shared-memory overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1248", "title": "Heterogeneous Operator Scheduling for Neural Engine", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a scheduler partition video operators across compute units without exceeding shared memory bandwidth and thermal limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1249", "title": "Architecting Latency Profiling on Snapdragon 8 Gen 3 NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a Snapdragon 8 Gen 3 profiler identify compute versus memory bottlenecks with under 2% bandwidth overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1250", "title": "Designing Structured Sparsity for NPU Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity architecture to fit this model within memory limits while maximizing NPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1251", "title": "On-Device Guardrails for GenAI on Exynos 2400", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a continuous monitoring system that evaluates the tradeoffs between sequential and concurrent guardrail execution to guarantee safety without degrading the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1252", "title": "Architecting a 4K Super-Resolution Pipeline on an NPU", "topic": "roofline-analysis", 
"competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 4K super-resolution pipeline use roofline analysis to guarantee it stays compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1253", "title": "Architecting ASIL-B Driver Monitoring on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the system to ensure deterministic execution, detailing memory isolation, watchdogs, and fallback if the NPU hangs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1254", "title": "Architecting Real-Time Audio Ingestion for Hexagon NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the data ingestion, buffering, and processing pipeline, justifying your decisions on where each stage of the pipeline should execute to balance latency and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1255", "title": "Architecting a Thermal-Aware Real-Time Video Processing Pipeline", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Propose a system-level design to maintain acceptable real-time performance indefinitely under sustained thermal constraints without severely degrading the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1256", "title": "Diagnosing OOM in On-Device LLM Shadow Rollout", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Tensor G3 shadow rollout of an LLM cause OOM and UI stutter, and what rollout architecture avoids it?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1257", "title": "Diagnosing Thermal Throttling from Adversarial Inputs on Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do certain Gemini Nano inputs trigger rapid thermal throttling and throughput collapse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1258", "title": "A17 Pro Neural Engine Latency Spike Diagnosis", 
"topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can Neural Engine latency spike above 40ms for a 150 GOPS INT8 segmentation model expected at 4.5ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1259", "title": "Diagnosing Model Collapse on Tensor G3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a personalized writing assistant become repetitive after federated updates with synthetic data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1260", "title": "On-Device Active Learning OOM Crashes", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a 50MB CoreML selection model still trigger OOM crashes during camera-frame curation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1261", "title": "Diagnosing On-Device PSI Drift OOM on Snapdragon", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does background PSI drift detection cause OOM on Snapdragon 8 Gen 3 despite stable INT8 inference?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1262", "title": "Diagnosing Thermal Throttling from Memory Bound NPU Workloads", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Snapdragon 8 Gen 3 ViT drain battery and throttle after 10 minutes despite meeting FPS and TOPS targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1263", "title": "Diagnosing Latency Bias in On-Device LLMs", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Gemini Nano transcription on Tensor G3 show higher latency and battery drain for AAVE than SAE transcripts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1264", "title": "Diagnosing A17 Pro Neural Engine OOM Under Thermal Throttling", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "Why does an A17 Pro video app encounter sudden OOM crashes during thermal degradation after a period of stable memory use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1265", "title": "Diagnosing Slow Distilled LLM Generation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why is a 1B distilled LLM on Tensor G3 barely faster than a 3B pruned model during autoregressive generation?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1266", "title": "Diagnosing Page Thrashing on Apple A17 Pro", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an mmap-backed 3GB A17 Pro language model spike to over 2.5 seconds after switching from the camera app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1267", "title": "Diagnosing OOM Evictions on Shared Memory Exynos 2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of these sudden evictions considering the system's shared memory architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1268", "title": "Diagnosing FP8 Activation Overflow on Tensor G3 TPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is causing this catastrophic degradation, and how would you resolve it while maintaining target latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1269", "title": "CI/CD Pipeline Graph Break Diagnosis", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What pipeline validation flaw led to this discrepancy, and what is the physical root cause on the device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1270", "title": "NPU Memory Bandwidth Contention", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Exynos 2400 NPU P99 latency spike 5x when the ISP processes high-resolution video?", "chain_ids": 
["mobile-chain-auto-001-09"], "chain_positions": {"mobile-chain-auto-001-09": 0}, "chain_tiers": {"mobile-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1271", "title": "Diagnosing OOM on Shared Memory NPU Architectures", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 8B INT8 Exynos 2400 LLM crash when a camera-based multimodal feature starts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1272", "title": "Diagnosing Latency Spikes on Snapdragon 8 Gen 3", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause of these symptoms using your knowledge of this heterogeneous architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1273", "title": "Shared Memory Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the hardware architecture, how do you diagnose the root cause of this latency degradation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1274", "title": "On-Device PII Guardrail Starvation on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 3B INT8 assistant on Snapdragon 8 Gen 3 starve a lightweight PII guardrail despite 30% NPU utilization?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": {"mobile-chain-auto-secondary-011-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1275", "title": "Diagnosing Exynos NPU Latency Spikes", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an Exynos 2400 safety model occasionally spike from 12ms to over 40ms, and how can determinism be restored?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1276", "title": "NPU Frame Drops During Real-Time Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "Why does NPU frame processing stutter at 30 FPS with only 30% NPU utilization but high memory latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1277", "title": "Diagnosing TPU Throttling During Sustained Gemini Nano Generation", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Gemini Nano on Tensor G3 slow by over 60% after 45s while LPDDR5X memory use stays flat?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1278", "title": "Evaluating Adversarial Defenses on Snapdragon NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which adversarial defense is more efficient on Snapdragon 8 Gen 3: an INT8 NPU ensemble or randomized smoothing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1279", "title": "Evaluating Attention Variants for Exynos 2400 NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the memory bandwidth and compute tradeoffs for each variant during the decoding phase and recommend the best architecture for this specific SoC?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1280", "title": "Evaluate Vision Transformer Deployment Cost on Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model do you choose for a target 60 FPS constraint, and how do you justify the compute cost tradeoffs?", "chain_ids": ["mobile-chain-auto-secondary-004-09"], "chain_positions": {"mobile-chain-auto-secondary-004-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1281", "title": "Evaluating Data Pruning for On-Device NPU Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which data selection approach is better suited for the heterogeneous Snapdragon 8 Gen 3 environment: CPU-based coreset selection via embeddings or NPU-based gradient loss selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1282", "title": "Calibration Data Selection 
for INT8 NPU Execution", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which calibration dataset strategy is better for maximizing overall INT8 accuracy on this dual-core NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1283", "title": "On-Device Drift Detection Alternatives", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which local drift detection approach is optimal for the Apple A17 Pro considering power and memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1284", "title": "On-Device Summarization Architecture Selection", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "For Exynos 2400 meeting summarization, should a 2B INT8 model be decoder-only or encoder-decoder to minimize memory bandwidth contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1285", "title": "Evaluating DRAM vs Compute Energy for A17 Pro", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model do you choose to maximize battery life, and why?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1286", "title": "Thermal Degradation on Shared Memory NPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Under a 50% NPU throttle, should tracking load an INT8 fallback model or drop from 60 FPS to 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1287", "title": "Distillation vs Pruning for Snapdragon Hexagon NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should you use 70% unstructured pruning or a 3x smaller dense student for 10ms video segmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1288", "title": "Evaluating PagedAttention vs Static Allocation for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", 
"status": "published", "phase": "inference", "question": "Should a Snapdragon 8 Gen 3 7B LLM use a 2 GB contiguous KV cache or PagedAttention within an 8 GB app budget?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], "chain_positions": {"mobile-chain-bucket-kvcachem-05": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1289", "title": "LLM KV Cache Allocation Strategy on Tensor G3", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 1.8B LLM on Tensor G3 use static or PagedAttention-style KV allocation under a 2.5 GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1290", "title": "Evaluating Mixed-Precision ViT on Snapdragon 8 Gen 3", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which precision strategy is most defensible for a 1B ViT targeting 30 FPS on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1291", "title": "LLM TPU Delegation vs CPU Fallback on Tensor G3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which TFLite strategy is faster: 100% TPU delegation with 2.2B params or 30% CPU fallback at 2.0B?", "chain_ids": ["mobile-chain-auto-001-08"], "chain_positions": {"mobile-chain-auto-001-08": 1}, "chain_tiers": {"mobile-chain-auto-001-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1292", "title": "On-Device LLM Evaluation for Tensor G3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Tensor G3 summarization model option should be chosen after considering weights, KV cache, and TPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1293", "title": "Heterogeneous Pipelining on Snapdragon 8 Gen 3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 30 FPS segmentation graph run the FP16 head on CPU sequentially or pipeline it on GPU?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1294", "title": "On-Device LLM Profiling Alternatives for Tensor G3", "topic": "profiling-bottleneck-analysis", 
"competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is Tensor G3 LLM generation at 150 ms/token limited by 7.5 TOPS TPU compute or 12 GB LPDDR5X bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-009-03"], "chain_positions": {"mobile-chain-auto-secondary-009-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1295", "title": "Evaluating On-Device Toxicity Guardrails for A17 Pro", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which toxicity guardrail design is better for a mobile deployment prioritizing strict PII privacy and low latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1296", "title": "Safety-Critical LLM Fallback Design on Tensor G3", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Tensor G3 hazard-alert fallback design meets a 150ms safety recovery SLA, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1297", "title": "Continuous Vision Thermal Throttling Strategy", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which thermal strategy better sustains continuous 30 FPS segmentation: burst-and-sleep or minimum-clock continuous execution?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1298", "title": "Evaluating Shadow vs. 
Canary on Snapdragon 8 Gen 3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a Snapdragon 8 Gen 3 video model rollout use shadow deployment or canary, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1299", "title": "A17 Pro Neural Engine vs GPU for Real-Time Video", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a segmentation model with GPU-supported custom ops run on the GPU or the NPU?", "chain_ids": ["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1300", "title": "Mmap Strategies for Shared Memory", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an 8GB FP16 model on a shared-memory architecture use mmap plus mlock or demand-paged mmap for predictable cold starts?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": {"mobile-chain-auto-secondary-014-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1301", "title": "CI/CD for On-Device Models on Apple A17 Pro Neural Engine", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate these proposals and determine which is better for production deployment?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1302", "title": "Hardware-Aware NAS for A17 Pro Neural Engine", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which A17 Pro NAS proposal yields lower real latency, FLOP-minimizing MBConv or hardware-profiled dense search within a ~5W power budget?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1303", "title": "A17 Pro Canary Rollout Performance Budget", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", 
"bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum FPS and memory footprint define the A17 Pro canary model theoretical performance budget if the model requires 0.5 TOPS/frame and 15% memory?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1304", "title": "A17 Pro Memory Bounds for Adversarial Defense", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum adversarial-training batch size fits on the target device after OS, model, and per-sample overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1305", "title": "Depthwise Separable Speedup on Exynos NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many MACs do standard and depthwise separable convolutions require, and what is the estimated latency assuming the NPU runs at 10% utilization of its 34.7 TOPS peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1306", "title": "Tensor G3 Gemini Nano Prefill Latency", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the pure compute latency for Tensor G3 to prefill a 500-token prompt with a 1.5B INT8 Gemini Nano model?", "chain_ids": ["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1307", "title": "On-Device Coreset Sizing for A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum coreset size fits a 2-second fine-tuning budget over 10 epochs on a 35 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1308", "title": "NPU Data Starvation and Pipelined Throughput", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What FPS results from sequential versus double-buffered CPU/NPU processing on a 45 TOPS NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1309", "title": "On-Device Data Quality Gate Compute Budget", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How much Exynos 2400 NPU capacity is consumed by four concurrent anomaly-detection quality-gate streams?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1310", "title": "On-Device Active Learning Compute for Curation", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Calculate the total daily compute time in seconds required on the 7.5 TOPS TPU to score and curate this dataset?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 0}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1311", "title": "On-Device KL Divergence Calculation", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you compute KL(P||Q) for the 3-bin distributions and decide whether the 34.7 TOPS NPU is compute- or LPDDR5X-bandwidth-bound?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1312", "title": "Encoder vs Decoder Prefill Compute on Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do decoder-only and encoder-decoder models compare on prefill compute and theoretical minimum latency on the Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1313", "title": "Energy Cost of Memory vs Compute on Exynos 2400", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much per-inference energy comes from compute versus LPDDR5X memory access for a 50M parameter INT8 model on Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1314", "title": "3-bit AWQ Footprint on Exynos 2400", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the memory footprint of the weights and the compute required (in TOPS) to generate 20 tokens per second, and evaluate if the NPU compute capacity is a limiting factor?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1315", "title": "On-Device Equal Opportunity Evaluation Performance", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the required compute time and energy to run a 14,000-image Equal Opportunity fairness audit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1316", "title": "On-Device LLM Battery Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the proposed degradation ladder keep the system fail-operational under the 15% battery fail-safe mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1317", "title": "Tensor G3 Constant Folding Latency", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many milliseconds of compute latency does a 20% constant folding reduction save?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1318", "title": "A17 Pro NPU Distillation Throughput", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum IPS and energy per inference does the distilled student achieve at 60% NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1319", "title": "KV-Cache Sizing for A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much FP16 KV-cache memory does a 4096-token 7B LLM need (32 layers, 32 heads, 128 head dimension), and does it fit in the remaining RAM?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1320", "title": "Memory-Mapped On-Device LLM Initialization", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the required memory bandwidth to achieve 15 tokens per second and determine if the physical RAM is sufficient to hold the model alongside runtime activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1321", "title": "NPU Shared Memory Batching Limit", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum batch size you can safely process in a single pass before triggering OS-level memory eviction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1322", "title": "FP16 Memory and Compute on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Do FP16 weights for a 1.5B A17 Pro model fit in memory, and what is the compute-bound step throughput?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1323", "title": "CI/CD Compute Gating for Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum theoretical FPS this model can achieve if the CI/CD gate policy strictly requires leaving 60% of the NPU compute available for concurrent ISP workloads?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1324", "title": "Hexagon NPU Delegation Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total inference latency given the CPU fallback and transition overheads?", "chain_ids": ["mobile-chain-auto-001-03"], "chain_positions": {"mobile-chain-auto-001-03": 0}, "chain_tiers": {"mobile-chain-auto-001-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1325", "title": "INT8 Model Memory Footprint on Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": 
"apply", "status": "published", "phase": "inference", "question": "What total memory footprint does a 3B INT8 model need with 25% KV/activation overhead, and does it fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1326", "title": "Parallel Schedule Energy Latency", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What inference latency and energy result from parallelizing 105 GOPs on the A17 Pro Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1327", "title": "Hexagon NPU Profiling Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the Hexagon NPU theoretical latency and utilization for a 225 GOPS model measured at 15ms?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1328", "title": "Structured Pruning Latency on Hexagon NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the exact NPU utilization percentage required for this model after pruning, given the Hexagon's peak INT8 compute capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1329", "title": "On-Device Guardrail Latency and Memory", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the memory footprint and theoretical compute latency for a 1B INT8 guardrail classifier on the Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1330", "title": "Watchdog Timer Sizing for Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog timeout should be set for a 90 GOPS Hexagon NPU frame with a 3x contention safety margin?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1331", "title": "Calculate sustained FPS under 
A17 Pro thermal throttling", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What sustained FPS can an A17 Pro object tracker reach under a 1.5W ANE thermal power limit, assuming linear power scaling?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1332", "title": "A17 Pro KV-Cache Memory Budgeting", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What memory is needed for 2B INT8 weights plus a 1000-token FP16 KV cache, and does it fit within a 3GB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1333", "title": "LLaMA-3 8B INT8 Quantization Memory Budget on Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the memory footprint for an 8B INT8 model plus a 2048-token KV cache, and does it fit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1334", "title": "Exynos 2400 NPU vs CPU Audio Offloading", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the Exynos 2400 NPU handle a 4 TOPS audio workload after sustained bandwidth limits, and what utilization is needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1335", "title": "Shadow Deployment Compute Utilization on Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What Hexagon NPU compute utilization results from running Models A and B together at 30 FPS in shadow mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1336", "title": "A17 Pro NPU Latency and Energy Calculation", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the expected inference latency in milliseconds and the energy consumed per inference in millijoules to quantify the benefit of NPU offloading?", "chain_ids": ["mobile-chain-auto-secondary-007-12"], "chain_positions": {"mobile-chain-auto-secondary-007-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1337", "title": "KV Cache Capacity on Exynos 2400", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum sequence length you can support if you implement standard Multi-Head Attention (MHA) versus Multi-Query Attention (MQA) with 1 KV head?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1338", "title": "MobileNetV2 GOPs vs Apple A17 Pro TOPS", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical maximum FPS can a 1.5 GOP MobileNetV2 model achieve on A17 Pro at 30% utilization?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1339", "title": "Estimating Inference FPS on Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What FPS can the Exynos 2400 process for a 150 GOPS super-resolution model at 35% utilization?", "chain_ids": ["mobile-chain-auto-secondary-004-09"], "chain_positions": {"mobile-chain-auto-secondary-004-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-09": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1340", "title": "Calculate Data Pruning Ratio for On-Device Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What pruning ratio is needed if only the retained coreset is processed in the 10-minute budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1341", "title": "Tensor G3 TPU Pipeline Throughput Calculation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum sensor data throughput can Tensor G3 sustain at 80% TPU utilization and 50,000 ops per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1342", "title": "Compute Budget for On-Device Frame Quality Gate", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What per-frame compute budget remains for a data quality gate after the main 60 FPS model uses 30 TOPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1343", "title": "On-Device Active Learning Throughput", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "If your uncertainty-scoring model requires 12.5 GOPS per inference, how many frames per minute can you evaluate to curate your dataset?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1344", "title": "On-Device MMD Drift Window Sizing", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many embeddings and minutes of history fit in a 50MB A17 Pro drift-detection window at 30 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1345", "title": "Compute Latency on Exynos 2400 NPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Assuming a peak compute of 34.7 TOPS, how would you calculate the theoretical compute-bound inference latency for both architectures, and find the absolute difference in milliseconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1346", "title": "Calculate NPU Compute and Memory Energy Per Inference", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much per-inference energy does the A17 Pro model spend on NPU compute and unified-memory weight fetches?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1347", "title": "Calculating 3-bit AWQ Memory Footprint", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total memory footprint of the weights and metadata, and does it fit within the 4.0 GB allocation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1348", "title": "Quantifying NPU Quantization Bias", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the Predictive Equality (False Positive Rate difference) gap for both models to quantify the fairness degradation?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1349", "title": "Calculating Fallback Model Budget on Exynos 2400", "topic": 
"graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum GOPS per frame a fallback model can require to sustain a strict 30 FPS under 40% throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1350", "title": "Calculate Max FPS After Operator Fusion on ANE", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum FPS can the A17 Pro compiled video model achieve after a 20% operation reduction?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": {"mobile-chain-auto-secondary-003-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1351", "title": "Distillation Target Latency on Hexagon NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What maximum GOP per frame can a distilled Hexagon NPU speech enhancement student use to meet a 5ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1352", "title": "KV-Cache Memory Sizing for Hexagon NPU", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the exact memory required for the KV-cache of a 3B LLM (4096 tokens, FP16) on an NPU, and does it fit in 2.0 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1353", "title": "Calculate Maximum Context Length for a Mobile INT4 LLM", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What maximum context window can a 1.8B INT4 mobile LLM service support under a 2.0 GB memory limit?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1354", "title": "Mixed-Precision Footprint on Snapdragon Hexagon NPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the exact mixed-precision model weight footprint and the theoretical minimum INT8 compute time per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1355", "title": "CI/CD Latency Gating for A17 Pro Neural 
Engine", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 450 GOPS candidate model pass a 50ms CI/CD latency gate at 30% hardware utilization on a 35 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1356", "title": "Tensor G3 LLM Memory Budgeting", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum standard quantization precision (in bits) we can use for the model weights to guarantee the model fits entirely within the remaining available RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1357", "title": "Hardware-Aware NAS Latency on Hexagon NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What is the theoretical compute latency for a 90M-MAC INT8 block on a 45 TOPS NPU?", "chain_ids": ["mobile-chain-auto-secondary-010-13"], "chain_positions": {"mobile-chain-auto-secondary-010-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-010-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1358", "title": "Operator Fusion Latency on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What execution time do unfused and fully fused NPU schedules take for the three-operator sequence?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1359", "title": "Calculate Prefill Latency Bound on Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum compute latency in milliseconds to process a 100-token prompt?", "chain_ids": ["mobile-chain-auto-secondary-009-03"], "chain_positions": {"mobile-chain-auto-secondary-009-03": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1360", "title": "Calculate 2:4 Structured Sparsity Speedup on Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How does 2:4 structured sparsity change memory footprint and workload-effective TOPS for this LLM deployment?", 
"chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1361", "title": "On-Device Guardrail Compute and Memory Sizing", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the static memory footprint of the guardrail weights and the theoretical minimum compute latency for one 128-token chunk, assuming 100% Neural Engine utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1362", "title": "Calculate Weight-Streaming LLM Inference Bound", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What operational intensity and token generation rate bound apply to batch-1 weight-streaming LLM inference?", "chain_ids": ["mobile-chain-bucket-roofline-04"], "chain_positions": {"mobile-chain-bucket-roofline-04": 0}, "chain_tiers": {"mobile-chain-bucket-roofline-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1363", "title": "Deterministic Execution Timing for On-Device LLM Watchdog", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What watchdog timeout should cover a 20-token emergency LLM response at 50% of a 7.5 TOPS mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1364", "title": "On-Device Real-Time Sensor Ingestion Throughput for G3 TPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the sensor ingestion buffer and throughput be sized for real-time health monitoring on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1365", "title": "Hexagon NPU Burst Thermal Limit Calculation", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of tera-operations the NPU can execute in peak burst state before thermal throttling engages?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1366", "title": "Calculate KV Cache Memory Footprint on Google Tensor G3", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", 
"question": "How much FP16 KV-cache memory is required for a single generation request reaching a sequence length of 2048 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1367", "title": "On-Device Shadow Rollout for A17 Pro Video Segmentation", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an A17 Pro shadow rollout for video segmentation without frame drops or thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1368", "title": "Exynos 2400 NPU Heterogeneous Pipeline Design", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an Exynos 2400 heterogeneous pipeline map dense, dynamic, and preprocessing work across its processors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1369", "title": "On-Device Defensive Architecture for Biometric Spoofing", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design an on-device defense against biometric spoofing for the A17 Pro Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1370", "title": "Designing a 32K Context Window for Google Tensor G3", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a novel attention architecture and memory management strategy that fits a 32K context window within a 2 GB budget while maximizing TPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1371", "title": "Exynos 2400 NPU Video Enhancement CNN Architecture", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a custom CNN architecture that maximizes utilization of the NPU's compute without bottlenecking the shared LPDDR5X memory during high-bandwidth concurrent ISP operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1372", "title": "On-Device LLM Sizing for Google Tensor G3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the model be sized and optimized across prefill, decode, and memory limits to meet the 2-second target?", "chain_ids": 
["mobile-chain-auto-secondary-004-07"], "chain_positions": {"mobile-chain-auto-secondary-004-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1373", "title": "On-Device Data Pruning for Continual Learning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you build an on-device data pruning pipeline for continual learning without exceeding mobile resource budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1374", "title": "A17 Pro Unified Memory Pipeline Optimization", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a zero-copy data pipeline that maximizes throughput and minimizes power consumption while ensuring the NPU is never starved for data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1375", "title": "On-Device Data Validation for Continuous Learning", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a multi-stage data validation pipeline for continuous learning that strictly bounds memory bandwidth and NPU utilization?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1376", "title": "On-Device LLM Active Learning Curation on Tensor G3", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should an on-device active learning system curate LLM personalization data on Tensor G3?", "chain_ids": ["mobile-chain-auto-027-13"], "chain_positions": {"mobile-chain-auto-027-13": 3}, "chain_tiers": {"mobile-chain-auto-027-13": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1377", "title": "On-Device DP-SGD on Hexagon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you combine per-example clipping, secure noise, and INT8 quantization on a Hexagon NPU without weakening DP?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 3}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-1378", "title": "On-Device ISP Concept Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you detect ISP concept drift without dropping 4K 60 FPS camera frames?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1379", "title": "On-Device Real-Time Translation Architecture for Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a model and execution strategy to guarantee continuous translation under 100ms per utterance?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1380", "title": "Energy-Aware Always-On Wake Vision Cascade", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a 50 mW always-on wake vision cascade while minimizing DRAM access?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1381", "title": "Sub-3-bit LLM Deployment on Exynos NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can an 8B assistant be quantized below 4 bits to fit the Exynos 2400 memory budget during heavy camera use?", "chain_ids": ["mobile-chain-auto-secondary-013-15"], "chain_positions": {"mobile-chain-auto-secondary-013-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-15": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1382", "title": "On-Device Intersectional Fairness", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the evaluation architecture for the A17 Pro to compute parity metrics across 16 subgroups without causing thermal throttling or battery drain?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-1383", "title": "On-Device LLM Federated Personalization on Tensor G3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you personalize an on-device LLM with federated LoRA under a 10 MB daily upload limit?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1384", "title": "Tensor G3 On-Device LLM Degradation Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you build a fail-operational degradation ladder for Tensor G3 translation and summarization under thermal and RAM pressure?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1385", "title": "A17 Pro Asymmetric Distillation for ASR", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you distill an ASR model asymmetrically for efficient execution on the NPU?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1386", "title": "Real-time Multimodal Assistant Latency on Hexagon NPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you schedule a real-time multimodal assistant on Hexagon NPU while avoiding shared memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1387", "title": "LLM Memory Co-Design on Exynos 2400 NPU", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design a memory hierarchy and execution architecture to guarantee 20 tokens/s generation without dropping camera frames?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 3}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1388", "title": "Zero-Copy LLM Architecture on Google Tensor G3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped inference architecture to achieve zero-copy weight sharing across processes?", 
"chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1389", "title": "On-Device Fine-Tuning Memory Orchestration", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should memory be orchestrated for on-device fine-tuning with a 4-bit base model and LoRA adapters to maintain the footprint under 2.0 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1390", "title": "On-Device FP8 Inference Design for LLM on A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design mixed-precision FP8 activation inference for an on-device LLM on A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1391", "title": "On-Device Multimodal Architecture for Snapdragon 8 Gen 3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect a quantization and memory allocation strategy to fit a 7B LLM and ViT within 12-16GB RAM and 64GB/s while targeting 15 tok/s and 30 FPS?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 3}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1392", "title": "Edge Telemetry Architecture for On-Device LLMs", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design edge telemetry to isolate memory, thermal, or OS scheduler stragglers without hurting latency or uploading raw traces?", "chain_ids": ["mobile-chain-auto-secondary-006-22"], "chain_positions": {"mobile-chain-auto-secondary-006-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1393", "title": "Hardware-Aware NAS for Mobile Video Segmentation", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS strategy that optimizes latency, DRAM bandwidth, and on-chip buffer reuse?", "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1394", "title": "Heterogeneous Scheduling for Multimodal Pipeline", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a heterogeneous multimodal pipeline be scheduled to reduce memory traffic and NPU stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1395", "title": "Always-On Video NPU Power Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you optimize power for always-on video analytics by pacing NPU work instead of racing to idle?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], "chain_positions": {"mobile-chain-bucket-powerbud-04": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1396", "title": "Heterogeneous Pipeline Profiling on Snapdragon 8 Gen 3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you profile this heterogeneous pipeline to find NPU stalls and memory bottlenecks, and what architectural changes eliminate the 15ms overhead?", "chain_ids": ["mobile-chain-auto-secondary-009-04"], "chain_positions": {"mobile-chain-auto-secondary-009-04": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1397", "title": "On-Device Guardrail Architecture for Exynos", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a concurrent architecture that ensures strict policy enforcement without degrading the user experience or starving the ISP during multimodal tasks?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 3}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1398", "title": "Safety-Critical ASIL-D Pedestrian Detection on Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect an ASIL-D pedestrian detection pipeline on Hexagon NPU with deterministic execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1399", "title": "Continuous On-Device Sensor Fusion and Inference Pipeline", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": 
"L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you build a continuous sensor fusion pipeline using low-power hubs, zero-copy buffers, and NPU bursts on a 45 TOPS INT8 NPU with 12-16 GB LPDDR5X memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1400", "title": "Continuous Real-Time Vision Under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a thermal-aware execution architecture that sustains 30 FPS while avoiding thermal throttling and frame drops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1401", "title": "On-Device LLM Budgeting for Snapdragon 8 Gen 3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a 7B on-device LLM be budgeted under a 6 GB memory cap while supporting a 4096-token context window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1402", "title": "A/B Rollout Memory Bottleneck on Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the A/B rollout memory bottleneck on Exynos 2400 and how should it be fixed?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1403", "title": "Optimizing Monte Carlo Dropout on Mobile NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the 5-pass Monte Carlo bottleneck on a shared-memory NPU, and how can it be optimized below 30ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1404", "title": "Optimizing Inverted Residuals on Google Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is this block design inefficient on the Tensor G3, and what structural change improves utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1405", "title": "Diagnosing NPU Compute Bottlenecks on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "How would you analyze the compute precision and shared memory architecture to diagnose the bottleneck and quantify the fix?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1406", "title": "On-Device Coreset Selection for Nightly Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should nightly fine-tuning data be scored and selected on-device using the Exynos 2400 dual-core NPU within a 100 TOPs budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1407", "title": "Optimizing On-Device Data Quality Gates for LLMs", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the bottleneck in this data validation pipeline and quantify an optimized on-device quality gate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1408", "title": "Optimizing On-Device Calibration Data Curation for NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate calibration data for NPU quantization without loading the full image corpus into memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1409", "title": "Optimizing On-Device Drift Detection for Google Tensor G3", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this computational bottleneck and quantify an optimized on-device drift detection strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1410", "title": "Optimizing Encoder-Decoder on Snapdragon NPU", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an encoder-decoder model be optimized on a mobile NPU for memory-bound autoregressive decoding?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1411", "title": "Optimizing Sub-4-bit LLM Deployment on Google Tensor G3", "topic": "extreme-quantization", 
"competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize a sub-4-bit LLM deployment on Tensor G3 for memory-bandwidth-bound decoding?", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1412", "title": "NPU Preemption and CPU Fallback Optimization", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design NPU preemption handling and CPU fallback without crashing or stalling the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1413", "title": "Distillation Projection Layer Bottlenecks", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are distillation projection layers a bottleneck on the NPU, and what student design avoids them?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1414", "title": "Optimizing KV-Cache Eviction for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cap and evict KV-cache entries for an on-device LLM while preserving useful context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1415", "title": "Zero-Copy Memory Mapping for Hexagon NPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory bottleneck and propose an optimization to eliminate the initialization spike?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": {"mobile-chain-auto-secondary-014-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1416", "title": "Mitigating Jetsam Evictions on A17 Pro NE", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory bottleneck and quantify an optimization strategy to prevent OOM evictions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1417", "title": "CI/CD Hardware Fallback on Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should CI/CD catch Tensor G3 TPU fallbacks before release, and what is the latency cost of this failure?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1418", "title": "CoreML Fallback Memory Transfer Bottleneck", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you eliminate CoreML CPU fallback for a custom attention operator on the A17 Pro Neural Engine?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": {"mobile-chain-auto-001-02": 1}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1419", "title": "A17 Pro Unified Memory LLM Bottleneck", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the primary memory bottleneck for inference, why does an FP16 deployment fail, and what optimization is required?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1420", "title": "Hexagon NPU Telemetry Bottleneck", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign Hexagon NPU telemetry to avoid per-frame memory and CPU overhead on a Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1421", "title": "Dual-Core NPU Operator Scheduling for Memory Contention Mitigation", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should dual-core Exynos NPU operators be scheduled to reduce shared memory contention, and which operators should be parallelized?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1422", "title": "A17 Pro ANE Memory Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the A17 Pro Neural Engine reach only 5 TOPS of 35 TOPS, and what latency gain should layout fusion deliver?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1423", "title": "Structured Pruning for Apple A17 Pro Neural Engine", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured pruning fail to speed up inference, and what structured approach should replace it?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1424", "title": "Optimizing On-Device Toxicity Guardrails for Gemini Nano", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the guardrail bottleneck and quantify an optimized execution strategy on a mobile TPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1425", "title": "Deterministic Execution on A17 Pro", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you make inference deterministic despite unified-memory contention from other subsystems?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1426", "title": "Optimizing High-Frequency Sensor Ingestion on A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the high-frequency IMU ingestion be optimized to unblock video processing and stay within a 5W envelope?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1427", "title": "Progressive Rollout of Gemini Nano A/B Experiment", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs between a strict 1% shadow deployment versus a phased geographic rollout, and quantify the resource thresholds that would trigger an automatic rollback?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1428", "title": "Hexagon NPU Sizing for On-Device LLM", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How should a mobile NPU be evaluated for batch-1 on-device LLM inference to meet the 40 tokens/sec SLA?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1429", "title": "On-device LLM Guardrail Sizing for Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare an auxiliary guardrail classifier with a longer system prompt for memory, TTFT, and compute on Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1430", "title": "GQA Sizing for On-Device 3B LLMs", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Grouped-Query Attention (GQA) be sized to reduce the KV-cache footprint compared to Multi-Head Attention (MHA)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1431", "title": "Sizing Inverted Residuals for Hexagon NPU", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should inverted residual expansion ratios be chosen to hit a strict 15ms latency budget?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1432", "title": "Estimating Real-Time Video Segmentation Compute on A17 Pro", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can real-time video segmentation fit within the Apple A17 Pro compute and memory bandwidth budget without thermal throttling?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1433", "title": "On-Device LLM Coreset Sizing for Tensor G3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you size the coreset and select examples so LoRA fine-tuning finishes 
within a 1-hour charging window?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1434", "title": "On-Device Data Validation Pipeline on Snapdragon 8 Gen 3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you split on-device data validation between CPU and Snapdragon 8 Gen 3 NPU considering memory and compute constraints?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1435", "title": "On-Device Active Learning Data Selection", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a mobile device score a 30 FPS 1080p stream for active learning without disrupting foreground apps and shared memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1436", "title": "DP-SGD Memory Constraints on Exynos NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should DP-SGD micro-batches or ghost clipping be sized for NPU memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1437", "title": "On-Device Streaming Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you perform streaming drift detection on-device using NPU embeddings and CPU histograms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1438", "title": "Sizing On-Device Translation Architectures", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device translation architecture be sized for NPU memory-bandwidth limits?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1439", "title": "Energy Profiling for NPU Operator Selection", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", 
"level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you choose between lookup tables and polynomial approximations using NPU energy consumption principles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1440", "title": "Sizing W4A8 LLM Deployment on a Flagship Phone", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you deploy a 7B LLM with W4A8 quantization under a 4 GB RAM limit, and what is its impact?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1441", "title": "On-Device LLM Intersectional Fairness Sizing", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size an on-device LLM fairness evaluation so subgroup analysis does not exceed memory or latency budgets?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1442", "title": "On-Device LoRA with Apple A17 Pro", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you run federated LoRA updates for a 1.5B model on a mobile device within memory limits without degrading the battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1443", "title": "Thermal Degradation for Video Segmentation on A17 Pro", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a degradation ladder that transitions to a fallback state without dropping frames or exhausting unified memory?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1444", "title": "Exynos 2400 NPU Operator Lowering Tradeoffs", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should unsupported attention operators be lowered or refactored for Exynos 2400 NPU delegation to prevent severe memory bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1445", "title": "Distilling LLMs for Tensor G3 
TPU Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you distill an LLM for Tensor G3 TPU deployment using hardware-friendly student architecture choices?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1446", "title": "KV-Cache Sizing for On-Device LLMs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design a KV-cache strategy and calculate the maximum context length for a 1.8B LLM within a 1.5 GB background memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1447", "title": "mmap-Driven LLM Streaming on iPhone 15 Pro", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate using memory-mapped files (mmap) to execute this model without being terminated by the OS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1448", "title": "Video Segmentation Memory Sizing on Exynos 2400", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the memory management strategy, address fragmentation, and size the buffers to ensure stable execution within a 500 MB budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1449", "title": "On-Device LLM Precision Strategy for Tensor G3", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate precision formats and decide on a deployment recipe that fits the 3.5 GB limit while maximizing throughput?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1450", "title": "CI/CD Release Gates for Hexagon NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What CI/CD release gates should verify Hexagon NPU memory bandwidth, SRAM use, and CPU fallback risk to guarantee a 5ms SLA?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": 
{"mobile-chain-auto-secondary-006-21": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1451", "title": "Exynos 2400 NPU Delegation and Operator Fallback", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you rewrite an Exynos 2400 model graph to avoid NPU delegation fallback?", "chain_ids": ["mobile-chain-auto-001-04"], "chain_positions": {"mobile-chain-auto-001-04": 1}, "chain_tiers": {"mobile-chain-auto-001-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1452", "title": "Sizing an LLM for Exynos 2400 NPU Deployment", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a 7B LLM be sized for Exynos 2400 using quantization and KV-cache budgeting?", "chain_ids": ["mobile-chain-auto-027-11"], "chain_positions": {"mobile-chain-auto-027-11": 1}, "chain_tiers": {"mobile-chain-auto-027-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1453", "title": "On-Device Telemetry Budgeting for A17 Pro Neural Engine", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you budget on-device telemetry for A17 Pro without sending raw logs or exceeding battery limits?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1454", "title": "Hardware-Aware NAS for Hexagon NPU Realization", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS search space to ensure the resulting architecture stays compute-bound without hitting NPU memory bandwidth limits?", "chain_ids": ["mobile-chain-auto-secondary-010-13"], "chain_positions": {"mobile-chain-auto-secondary-010-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1455", "title": "Evaluate A17 Pro Power Budgeting for Real-Time Video AI", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should real-time video AI be power-budgeted on A17 Pro to avoid sustained thermal throttling?", "chain_ids": ["mobile-chain-bucket-powerbud-04"], 
"chain_positions": {"mobile-chain-bucket-powerbud-04": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1456", "title": "NPU Memory Contention in Shared LPDDR5X Systems", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you evaluate the memory trace to identify the bottleneck, and what architectural adjustments do you make to hit the latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1457", "title": "NPU Sparsity Optimization for 4K Video", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What are the tradeoffs between applying 50% unstructured pruning versus 2:4 structured sparsity to achieve the 30 FPS target on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1458", "title": "Quantization Sizing for Exynos 2400 Shared Memory Budget", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What quantization plan fits a generative model into the Exynos 2400 shared memory budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1459", "title": "Sizing On-Device Guardrails on Hexagon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size on-device guardrail inference on Hexagon NPU for low-latency generated text checks?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": {"mobile-chain-auto-secondary-011-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1460", "title": "On-Device LLM Roofline Analysis on Tensor G3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Using the roofline model, evaluate whether autoregressive decoding (batch size 1) is compute-bound or memory-bound, and predict the impact of INT4 weight-only quantization?", "chain_ids": ["mobile-chain-bucket-roofline-04"], "chain_positions": {"mobile-chain-bucket-roofline-04": 1}, "chain_tiers": {"mobile-chain-bucket-roofline-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1461", "title": "Deterministic Driver Monitoring on Exynos 2400", "topic": "safety-certification", "competency_area": 
"reliability", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you make the driver monitoring deterministic with watchdogs and pinned NPU resources so it never misses a deadline?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1462", "title": "Real-Time Sensor Fusion Ingestion Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you size real-time sensor fusion ingestion using shared memory ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1463", "title": "Sustained Thermal Budgeting for Continuous LLMs", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you sustain continuous LLM audio processing within Tensor G3 thermal limits?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1464", "title": "Sizing KV-Cache for On-Device LLMs on Tensor G3", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should KV-cache and model weights be sized for a Tensor G3 on-device LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1465", "title": "Exynos 2400 Shared Memory Budgeting", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Exynos 2400 shared memory be allocated for a 3B LLM and its KV cache to support a 2048-token context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1466", "title": "Canary Release Definition for Tensor G3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is a canary release for Tensor G3 model deployment and why does it reduce rollout risk?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1467", "title": "Hexagon NPU INT8 Peak Performance Recall", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", 
"phase": "inference", "question": "What is the advertised peak theoretical INT8 compute capacity of the Hexagon NPU for roofline modeling?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1468", "title": "Hardware-Based Model Extraction via TPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the specific term for this class of hardware-based security attack?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1469", "title": "MobileNet Depthwise Separable Convolution Recall", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific type of convolution operation, first popularized by the MobileNetV1 architecture, factorizes a standard convolution into two separate layers to drastically reduce computational cost?", "chain_ids": ["mobile-chain-auto-secondary-002-13"], "chain_positions": {"mobile-chain-auto-secondary-002-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1470", "title": "A17 Pro Neural Engine Specification Recall", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the key Apple A17 Pro Neural Engine compute and memory specifications for mobile ML sizing?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1471", "title": "Define Coresets for On-Device Fine-Tuning", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What ML term describes a small representative subset for Tensor G3 on-device personalization under tight memory limits?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1472", "title": "Hexagon NPU Peak Throughput Data Type", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What specific numerical data type must the input schema validate against to natively 
utilize the peak 45 TOPS capability of this NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1473", "title": "On-Device Curation Memory Limit", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the total unified memory capacity shared between the CPU, GPU, and Neural Engine on the Apple A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1474", "title": "Hexagon NPU Capacity for Drift Detection", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What Hexagon NPU compute and shared-memory capacities are critical constraints for background drift detection?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1475", "title": "Encoder vs Decoder Bottlenecks on A17 Pro", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "By definition, which phase (encoder prefill or decoder generation) is typically compute-bound, and which is memory-bandwidth-bound?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Encoder prefill is compute-bound; decoder generation is memory-bandwidth-bound.", "Encoder prefill is memory-bandwidth-bound; decoder generation is compute-bound.", "Both phases are heavily compute-bound due to the 35 TOPS Neural Engine.", "Both phases are heavily memory-bandwidth-bound due to the unified memory pool."], "correct_index": 0}}, {"id": "mobile-1476", "title": "Definition of Equalized Odds for On-Device Models", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the exact definition of equalized odds in the context of fairness evaluation?", "chain_ids": ["mobile-chain-auto-secondary-013-16"], "chain_positions": {"mobile-chain-auto-secondary-013-16": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1477", "title": "A17 Pro Unified Memory Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", 
"track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total capacity of the shared unified memory available across the CPU, GPU, and 16-core Neural Engine on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-012-03"], "chain_positions": {"mobile-chain-auto-secondary-012-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1478", "title": "Recall Knowledge Distillation Basics for Tensor G3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is knowledge distillation and how does it train a smaller Tensor G3 student model?", "chain_ids": ["mobile-chain-auto-secondary-014-20"], "chain_positions": {"mobile-chain-auto-secondary-014-20": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1479", "title": "Tensor G3 KV Cache Paged Allocation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory management technique stores KV cache in non-contiguous physical blocks to eliminate external fragmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1480", "title": "Apple A17 Pro Unified Memory Recall", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory architecture does the A17 Pro use to share its 8 GB pool between the CPU, GPU, and Neural Engine?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1481", "title": "Exynos 2400 Unified Memory Capacity Recall", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total shared LPDDR5X memory capacity on the Exynos 2400 that the NPU shares with the CPU and GPU?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1482", "title": "Google Tensor G3 TPU 16-bit Format Recall", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the name of this format, and how many bits does it allocate to the exponent versus the fraction?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 0}, "chain_tiers": 
{"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1483", "title": "Exynos 2400 Shared Memory Model Sizing", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How does shared LPDDR5X memory on Exynos 2400 affect the maximum feasible on-device model size?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1484", "title": "Tensor G3 Memory Constraints for Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the total available system RAM on the Google Tensor G3, and what specific on-device LLM is this platform optimized to run concurrently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1485", "title": "Exynos 2400 NPU Shared Memory Architecture", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which shared-memory metrics should be monitored to profile Exynos 2400 NPU bottlenecks?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1486", "title": "Unstructured vs Structured Pruning on Exynos 2400 NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Why does unstructured pruning usually fail to speed up inference on mobile NPUs, and when does structured pruning help?", "chain_ids": ["mobile-chain-auto-secondary-006-34"], "chain_positions": {"mobile-chain-auto-secondary-006-34": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1487", "title": "Edge Guardrail NPU Performance Recall", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To properly document the hardware assumptions and latency constraints for the edge deployment, what is the advertised peak INT8 performance of this specific NPU class?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1488", "title": "Hardware Safety Mechanism Recall", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the standard industry term for this hardware 
safety mechanism?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1489", "title": "Exynos 2400 Shared Memory Architecture", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware specifications of modern mobile SoCs, what type of memory architecture does the NPU use to access streaming data alongside the CPU and GPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1490", "title": "Google Tensor G3 TPU Peak Performance Recall", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For burst compute tasks before thermal throttling engages, what is the specified peak performance in TOPS for the Google Tensor G3's on-device TPU?", "chain_ids": ["mobile-chain-auto-secondary-013-21"], "chain_positions": {"mobile-chain-auto-secondary-013-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1491", "title": "Shadow Deployment Design for Exynos 2400", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify an Exynos 2400 shadow deployment that protects latency and memory headroom without causing memory starvation or thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1492", "title": "Designing On-Device Defenses for Exynos 2400 NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design on-device adversarial defenses for the NPU without blowing the latency or memory bandwidth budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1493", "title": "Designing Mobile Attention for Hexagon NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an attention architecture specification that fits within the NPU's capabilities and memory limits while maintaining acceptable generation quality?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "mobile-1494", "title": "Real-Time Object Detection on Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a real-time object detector for Tensor G3 that balances compute and memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1495", "title": "On-Device Inference Cost on Snapdragon 8 Gen 3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate on-device inference cost on Snapdragon 8 Gen 3 from compute and memory limits to meet a 20 tokens/sec requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1496", "title": "On-Device Coreset Selection for Image Personalization", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Design a data pruning and selection pipeline that maximizes the Information-Compute Ratio (ICR) without causing memory exhaustion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1497", "title": "On-Device Real-Time Video Pipeline Design for A17 Pro", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a real-time A17 Pro video pipeline using zero-copy unified memory to meet these requirements without excessively draining the battery?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 2}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1498", "title": "On-Device Data Quality Gates for LLMs", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design quality gates to filter out random pocket-dials, PII, and gibberish without degrading device performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1499", "title": "Data Curation for INT8 Hexagon NPU Calibration", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you curate an INT8 calibration dataset that captures long-tail edge cases considering the shared 12-16 GB LPDDR5X memory?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 1}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1500", "title": "On-Device DP-SGD Runtime for A17 Pro Text Prediction", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify on-device DP-SGD text prediction on A17 Pro with private clipping and noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1501", "title": "On-Device Drift Detection for Translation Models", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a privacy-preserving drift detection system for Tensor G3 translation models using local embeddings and federated analytics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1502", "title": "On-Device Summarization Architecture Selection for Snapdragon 8 Gen 3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which architecture should Snapdragon 8 Gen 3 use for long-input, short-output summarization and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1503", "title": "Energy-Aware Memory Access Design on Tensor G3", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an inference specification that minimizes energy per operation, specifically focusing on memory access versus compute?", "chain_ids": ["mobile-chain-auto-secondary-013-13"], "chain_positions": {"mobile-chain-auto-secondary-013-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1504", "title": "Federated Learning Specification for Exynos 2400", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a federated learning client specification that respects shared memory and thermal constraints on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1505", "title": "Real-Time Translation Degradation on Snapdragon 8 Gen 3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a real-time translation degradation ladder on Snapdragon 8 Gen 3 to guarantee continuous 
availability under severe thermal or memory constraints?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1506", "title": "Exynos 2400 NPU Distillation Specification", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you specify distillation for an Exynos 2400 NPU-friendly student model?", "chain_ids": ["mobile-chain-auto-secondary-014-21"], "chain_positions": {"mobile-chain-auto-secondary-014-21": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1507", "title": "Memory-Mapped NPU Inference Design on Snapdragon 8 Gen 3", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a memory-mapped inference system to achieve zero-copy weight loading and avoid redundant memory allocations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1508", "title": "A17 Pro Memory Specification for On-Device LLM", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an A17 Pro on-device LLM memory plan handle weights, KV cache, and activations given the OS and background apps consume a baseline of 3.5 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1509", "title": "Mixed-Precision Video Super-Resolution on Exynos 2400", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you specify mixed-precision (INT8/FP16) video super-resolution on Exynos 2400 to maintain fidelity without exceeding the 34.7 TOPS budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1510", "title": "CI/CD Pipeline Design for On-Device LLM on Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What CI/CD checks catch LLM latency, memory, and consistency regressions before OTA release?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1511", "title": "CoreML Conversion and ANE Delegation Strategy", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert a model to CoreML while avoiding NPU delegation fallbacks, keeping execution within the thermal 
budget?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 3}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1512", "title": "On-Device LLM Sizing for Apple A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a 7B LLM be sized and partitioned to fit within the 8 GB unified memory while efficiently leveraging the Neural Engine?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 1}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1513", "title": "Dual-Core NPU Operator Scheduling for Memory Contention", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should dual-core Exynos NPU scheduling interleave compute-bound and memory-bound operators?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1514", "title": "Profiling ANE Bottlenecks for Real-Time Video Segmentation", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile A17 Pro Neural Engine bottlenecks for real-time video segmentation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1515", "title": "Designing a Sparsity Strategy for A17 Pro ANE", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What pruning structure gives real A17 Pro ANE speedups for a 1.2B vision-language model at 30 FPS and 2W?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1516", "title": "Architecting for the Exynos 2400 NPU Roofline", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect a 60 FPS real-time segmentation model for an NPU to remain compute-bound under shared-memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1517", "title": "Designing a Resilient Pedestrian Detection System", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design watchdogs and fallback inference for a mobile pedestrian-alert model with 
a 100 ms fail-soft deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1518", "title": "Real-Time Sensor Ingestion for AR Tracking", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you ingest AR camera and IMU streams in real time using unified-memory ring buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1519", "title": "Continuous Video Analytics Thermal Throttling Design", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design continuous video analytics to maintain sustained performance under thermal throttling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1520", "title": "LLM Memory Budget Specification for A17 Pro", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Under stated 7B LLM assumptions, how should the 5 GB A17 Pro memory budget be split across weights, KV-cache, and activations?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1521", "title": "Roofline Analysis for Mobile AI: Optimizing Real-time Object Detection on Apple A17 Pro", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use roofline analysis to diagnose this performance bottleneck and optimize real-time object detection?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 3}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1522", "title": "Roofline Analysis for Mobile AI Accelerator: Optimizing a Vision Model on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use Roofline Analysis to diagnose whether the model is compute- or memory-bound and propose optimizations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1523", "title": "Exynos 2400 NPU Roofline Bound for Generative Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", 
"phase": "inference", "question": "How would roofline analysis classify an Exynos 2400 NPU generative workload as compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1524", "title": "Roofline Analysis on Apple A17 Pro: Identifying Bottlenecks", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using the Roofline Model, how would you diagnose whether this poor utilization is due to compute-bound or memory-bound limitations?", "chain_ids": ["mobile-chain-bucket-roofline-01"], "chain_positions": {"mobile-chain-bucket-roofline-01": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1525", "title": "Roofline Analysis for MobileNetV3 on Google Tensor G3", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using roofline math, is a 100 GOPS model moving 1GB per inference compute-bound or memory-bound, and how would you optimize it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1526", "title": "Optimizing Mobile LLMs: Snapdragon 8 Gen 3 NPU Capabilities", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the architectural advantages and disadvantages of using the Hexagon NPU (45 TOPS) for a 7B INT4/INT8 LLM compared to the device's GPU or CPU?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1527", "title": "Mobile AI Inference Sizing on Snapdragon NPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should mobile AI inference on an NPU be sized using empirical profiling beyond peak TOPS?", "chain_ids": ["mobile-chain-auto-secondary-007-10"], "chain_positions": {"mobile-chain-auto-secondary-007-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-007-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1528", "title": "A17 Pro Neural Engine Inference Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical maximum inference throughput of the A17 Pro Neural Engine for the model?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 1}, "chain_tiers": 
{"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1529", "title": "Tensor G3 On-Device Inference Costing", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cost on-device inference across profiling, quantization, and memory limits for a 500 GFLOP LLM?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1530", "title": "Mobile AI Inference Cost on Exynos 2400", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you calculate Exynos 2400 NPU utilization and memory bandwidth for a mobile AI workload?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1531", "title": "Snapdragon 8 Gen 3 NPU Inference Cost Optimization", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate FLOPs, NPU-hours, and subsequent energy costs for an INT8 LLM on the Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-004-08"], "chain_positions": {"mobile-chain-auto-secondary-004-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1532", "title": "Evaluating Model Deployment on Samsung Exynos 2400 NPU", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Evaluate which architecture is more suitable and quantify the maximum sustainable FPS for each model on the given hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1533", "title": "Google Tensor G3 Unified Memory Inference Budget", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do model weights, intermediate activations, and the KV-cache consume and impact a 12 GB LPDDR5X inference budget?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1534", "title": "Optimizing LLM Memory Footprint on Apple A17 Pro for On-Device Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a strategy to fit this 
model within the memory constraints while maximizing inference speed and minimizing power consumption?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1535", "title": "Diagnosing OOM on Tensor G3: VRAM Budgeting for Large Models", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the most likely culprits for the OOM error, considering all major memory components?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1536", "title": "Mobile LLM Memory Budgeting for 12 GB Devices", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget mobile model memory across weights, activations, and training state to fit within 12 GB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1537", "title": "VRAM Budgeting for 7B LLM Inference on Apple A17 Pro", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget a 7B LLM for Apple A17 Pro unified memory and explicitly account for KV-cache growth?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1538", "title": "VRAM Budgeting for On-Device LLM Inference on Google Tensor G3", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your approach to estimate and manage the VRAM budget effectively?", "chain_ids": ["mobile-chain-auto-secondary-012-14"], "chain_positions": {"mobile-chain-auto-secondary-012-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1539", "title": "On-Device LLM Deployment: Apple A17 Pro VRAM Budgeting for Inference", "topic": "vram-budgeting", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you budget for model weights, activations, and the KV-cache for a maximum sequence length of 2048 tokens to balance memory efficiency versus 
performance?", "chain_ids": ["mobile-chain-auto-secondary-012-13"], "chain_positions": {"mobile-chain-auto-secondary-012-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1540", "title": "Apple A17 Pro KV-Cache Sizing and Memory Pressure for LLM Inference", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does KV-cache sizing impact overall system memory availability for other applications and the operating system?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 1}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1541", "title": "KV-Cache Memory Optimization on Snapdragon 8 Gen 3 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the memory pressure causes and propose strategies to optimize KV-cache management for the Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1542", "title": "On-Device LLM KV-Cache Optimization for Google Tensor G3", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a KV-cache management strategy that optimizes inference performance and mitigates memory pressure on the 12GB constraint for long contexts?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 2}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1543", "title": "Diagnosing KV-Cache Memory Pressure on Apple A17 Pro", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose KV-cache memory pressure on Apple A17 Pro during long-context LLM use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1544", "title": "Optimizing KV-Cache on Snapdragon 8 Gen 3 for LLMs with Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive KV-cache management strategy for a 7B LLM on a mobile NPU to efficiently support 4096-token contexts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1545", "title": "KV-Cache Pressure 
on Google Tensor G3 for Long Contexts", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a mobile LLM manage KV cache for 2048-token context to avoid latency spikes and OOMs?", "chain_ids": ["mobile-chain-bucket-kvcachem-03"], "chain_positions": {"mobile-chain-bucket-kvcachem-03": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1546", "title": "KV-Cache Optimization for Large Language Models on Edge NPUs", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an edge NPU KV-cache strategy balancing paged allocation and pre-allocation under strict memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1547", "title": "Optimizing Large Language Model Deployment on Apple A17 Pro for Cold Start and Shared Memory", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a strategy to leverage memory-mapped files for efficient weight loading, shared memory across processes, and techniques to mitigate cold start issues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1548", "title": "Cold Start Optimization for Large Generative Models on Apple A17 Pro", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a memory-mapping strategy to minimize cold start latency for a large generative model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1549", "title": "Optimizing LLM Inference with Memory-Mapped Weights on Samsung Exynos 2400 NPU", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize memory-mapped LLM weights on Exynos 2400 to reduce cold-start latency and efficiently share memory across processes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1550", "title": "Designing for Memory Pressure on Google Tensor G3", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the LLM for real-time inference while avoiding OOM errors and minimizing fragmentation?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 1}, "chain_tiers": 
{"mobile-chain-auto-secondary-014-24": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1551", "title": "Memory Optimization for 7B LLM Inference on Samsung Exynos 2400 NPU", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What minimum integer quantization bit-width is required to fit the weights, and what is the KV cache size for 128 tokens?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1552", "title": "Optimizing Large Generative AI Model Deployment on Samsung Exynos 2400 NPU under Memory Constraints", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate host offloading versus activation recomputation to fit a 15GB peak memory LLM into the Exynos 2400's 12GB RAM?", "chain_ids": ["mobile-chain-auto-secondary-014-22"], "chain_positions": {"mobile-chain-auto-secondary-014-22": 3}, "chain_tiers": {"mobile-chain-auto-secondary-014-22": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1553", "title": "Optimizing Large Language Models on Google Tensor G3 for Mobile", "topic": "memory-pressure-management", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize large language model deployment on Tensor G3 for mobile memory and latency?", "chain_ids": ["mobile-chain-auto-secondary-014-24"], "chain_positions": {"mobile-chain-auto-secondary-014-24": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1554", "title": "Analyzing On-Device LLM Latency on Google Tensor G3", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose on-device LLM latency on Tensor G3 into TTFT, TPOT, and processing overheads?", "chain_ids": ["mobile-chain-auto-019-03"], "chain_positions": {"mobile-chain-auto-019-03": 3}, "chain_tiers": {"mobile-chain-auto-019-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1555", "title": "Apple A17 Pro On-Device ML Latency Decomposition for Real-Time AR", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose Apple A17 Pro ML latency for a real-time AR pipeline to find bottlenecks?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1556", "title": "Latency Decomposition for Mobile ML on Snapdragon NPU", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile and decompose latency components to determine which architecture offers better real-time performance?", "chain_ids": ["mobile-chain-auto-019-04"], "chain_positions": {"mobile-chain-auto-019-04": 2}, "chain_tiers": {"mobile-chain-auto-019-04": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1557", "title": "Optimizing On-Device ML Inference Latency on Samsung Exynos 2400 NPU for Mobile Applications", "topic": "latency-decomposition", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose and optimize end-to-end Exynos 2400 NPU inference latency for a mobile app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1558", "title": "Optimizing Real-Time Semantic Segmentation Latency on Samsung Exynos 2400 NPU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline specific implementation strategies focusing on both compute and memory to achieve consistent sub-33ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1559", "title": "Real-Time ML Inference on Apple A17 Pro: Frame Budgeting for Jank Prevention", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the ML inference pipeline to strictly adhere to the 16.67ms frame budget on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1560", "title": "Real-Time ML Inference and WCET Analysis on Snapdragon NPU for AR/VR", "topic": "real-time-deadlines", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you perform WCET (Worst-Case Execution Time) analysis for real-time ML inference on the NPU to prevent AR jank?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1561", "title": "Diagnosing ML Model Latency on Google Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose whether the primary bottleneck is compute, memory, or I/O bound on this specific hardware platform?", "chain_ids": ["mobile-chain-auto-secondary-009-03"], "chain_positions": 
{"mobile-chain-auto-secondary-009-03": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1562", "title": "Profiling Latency in a Mobile NPU Application", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific metrics would you monitor and how would you interpret the data to distinguish between compute, memory, and I/O bound issues on an NPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1563", "title": "Optimizing Real-Time ML Inference on Apple A17 Pro for Low Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile this application on an Apple Neural Engine to identify bottlenecks and achieve the 30ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1564", "title": "Optimizing a Transformer Model on Snapdragon Hexagon NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize a transformer model bottleneck on a mobile NPU to reach the 100ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1565", "title": "Real-time Semantic Segmentation on Exynos 2400: Latency Bottleneck Design", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you profile and identify latency bottlenecks to preemptively meet the 30ms constraint on the Exynos 2400?", "chain_ids": ["mobile-chain-auto-secondary-009-05"], "chain_positions": {"mobile-chain-auto-secondary-009-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-009-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1566", "title": "Optimizing Object Detection Latency on Apple A17 Pro", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use mobile profiling tools to find why an object detector takes 60ms instead of the 33ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1567", "title": "Optimizing Real-time ML Inference Latency on Snapdragon NPU", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific 
profiling tools and techniques would you employ to pinpoint the exact causes of latency spikes and evaluate the two architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1568", "title": "Optimizing On-Device Object Detection Latency on Tensor G3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize on-device object detection latency using profiling and model changes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1569", "title": "Optimizing a Vision Model on Samsung Exynos 2400 NPU for Low Latency", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically profile to determine if the bottleneck is compute-bound, memory-bound, or I/O-bound, and identify the root cause?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1570", "title": "Optimizing Real-time Object Detection Latency on Apple A17 Pro", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which system metrics and profiling tools matter most when 80ms average inference latency spikes to 200ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1571", "title": "Designing an INT8 Quantization Strategy for On-Device Object Detection on Google Tensor G3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an INT8 quantization strategy for Tensor G3 on-device object detection to meet <50ms latency and <200MB footprint constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1572", "title": "LLM Quantization Strategy for Mobile NPU Throughput", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive quantization strategy to meet these quantitative memory and throughput constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1573", "title": "Optimizing LLM Deployment on Apple A17 Pro: Quantization Strategy Deep Dive", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is your strategy for quantizing this LLM to achieve real-time
inference while minimizing memory footprint and energy consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1574", "title": "Optimizing LLM Inference on Snapdragon 8 Gen 3 with INT4 Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Snapdragon 8 Gen 3 LLM inference with INT4 quantization to achieve sub-200ms TPOT and reduce memory footprint?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1575", "title": "Google Tensor G3: Mixed-Precision Strategy for On-Device LLM Inference", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the primary considerations and trade-offs for choosing between FP16, BF16, and FP8 mixed-precision formats for LLM inference on this hardware?", "chain_ids": ["mobile-chain-auto-secondary-012-05"], "chain_positions": {"mobile-chain-auto-secondary-012-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1576", "title": "Optimizing Large Language Model Inference with Mixed-Precision on Apple A17 Pro", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which mixed-precision plan gets an FP32 LLM under 100 ms/token on an NPU within 8 GB and ~5W?", "chain_ids": ["mobile-chain-auto-secondary-012-06"], "chain_positions": {"mobile-chain-auto-secondary-012-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1577", "title": "Snapdragon Hexagon NPU Mixed-Precision LLM Deployment", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the percentage reduction in memory footprint, how many maximum parameters can fit in 12 GB using FP8, and what are the speed/accuracy trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1578", "title": "Mixed-Precision LLM Deployment on Samsung Exynos 2400 NPU", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a mixed-precision inference and fine-tuning strategy for a 7B LLM on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1579", "title": "Optimizing LLM Inference on Apple A17 Pro with Mixed Precision", "topic": 
"mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you optimize Apple A17 Pro LLM inference with mixed precision to meet latency and energy budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1580", "title": "INT8 Versus FP16 Quantization for Mobile LLM Inference", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which quantization approach is mandatory for this mobile deployment scenario, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1581", "title": "Optimizing Mixed-Precision Inference on Exynos 2400 NPU for Mobile CV", "topic": "mixed-precision-training", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks and quantify mixed-precision optimization gains for a CV model on the Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1582", "title": "Sub-4-bit LLM Deployment on Google Tensor G3", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 14 GB FP16 7B LLM be compressed below 4 bits for Tensor G3 without losing critical task accuracy?", "chain_ids": ["mobile-chain-auto-secondary-013-14"], "chain_positions": {"mobile-chain-auto-secondary-013-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1583", "title": "Extreme Sub-4-bit Quantization on Samsung Exynos 2400 NPU", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate extreme sub-4-bit quantization for an LLM on Exynos 2400 NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1584", "title": "Extreme Quantization for On-Device LLM: A17 Pro Evaluation", "topic": "extreme-quantization", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate extreme quantization options for an on-device LLM on Apple A17 Pro considering power, performance, and latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1585", "title": "Deploying Sub-4-bit LLMs on Exynos 2400 NPU: Precision, Performance, and Trade-offs", "topic": "extreme-quantization", "competency_area": "precision", "track": 
"mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the key trade-offs you'd consider, and how would you evaluate the effectiveness of different extreme quantization techniques specific to this hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1586", "title": "Apple A17 Pro ML Inference Power Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you choose the most energy-efficient Core ML settings for 60 FPS inference within a 2.5W ML power budget?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1587", "title": "Optimizing Quantized LLM Inference on Snapdragon 8 Gen 3 Hexagon NPU for Power Efficiency", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize quantized LLM inference on a mobile NPU for power efficiency to meet a 20 ms latency and 3W power cap?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 4}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1588", "title": "Optimizing ML Inference Power on Google Tensor G3", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Tensor G3 ML inference power without sacrificing latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1589", "title": "Thermal Management and Sustained Performance of Samsung Exynos 2400 NPU", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do mobile thermal management strategies impact the sustained versus burst performance profile of an NPU during continuous inference?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1590", "title": "Optimizing LLM Inference on Snapdragon 8 Gen 3 NPU under Thermal Constraints", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Snapdragon 8 Gen 3 schedule burst and sustained LLM inference when 
thermal throttling lowers NPU throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1591", "title": "Diagnosing Sustained Performance Degradation on Exynos 2400 NPU", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this sustained performance degradation and propose solutions?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1592", "title": "Samsung Exynos 2400 NPU Thermal Constraints for Sustained ML Inference", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle NPU thermal constraints for sustained ML inference?", "chain_ids": ["mobile-chain-auto-secondary-013-22"], "chain_positions": {"mobile-chain-auto-secondary-013-22": 4}, "chain_tiers": {"mobile-chain-auto-secondary-013-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1593", "title": "Energy Optimization on Samsung Exynos 2400 NPU: Memory vs. Compute Costs", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize NPU energy by balancing memory access versus compute cost using Horowitz energy principles?", "chain_ids": ["mobile-chain-auto-secondary-013-11"], "chain_positions": {"mobile-chain-auto-secondary-013-11": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1594", "title": "Optimizing Neural Network Inference on Apple A17 Pro for Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the system to minimize energy per operation, considering the Horowitz energy table, the energy cost of memory access versus compute, and energy-aware operator selection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1595", "title": "Energy Modeling for On-Device INT8 Convolution on Snapdragon Hexagon NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you model INT8 convolution energy on Snapdragon Hexagon NPU across compute and memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"mobile-1596", "title": "Energy-Aware LLM Deployment on Samsung Exynos 2400 NPU", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize the model's energy profile considering the Horowitz energy table principles and the disparity between NPU compute and LPDDR5X memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1597", "title": "Optimizing On-Device ML Energy Consumption on Apple A17 Pro", "topic": "energy-per-operation", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize an A17 Pro model's energy consumption considering Horowitz principles and hardware operator efficiency?", "chain_ids": ["mobile-chain-auto-secondary-013-12"], "chain_positions": {"mobile-chain-auto-secondary-013-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1598", "title": "Optimizing Transformer Inference on Samsung Exynos 2400 NPU: Attention Complexity and KV-Cache Management", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the fundamental architectural and scaling challenges of standard Transformer models that contribute to these issues on this specific mobile platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1599", "title": "Apple A17 Pro: Optimizing Transformer Inference for Low-Power Mobile Devices", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize transformer inference on Apple A17 Pro for low-power mobile deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1600", "title": "Snapdragon NPU LLM Deployment: Cost Analysis", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the memory footprint and theoretical sequence length limit for a 7B parameter LLM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1601", "title": "Optimizing Large Transformer Inference on Snapdragon 8 Gen 3 NPU", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What architectural and algorithmic optimizations would you propose to manage memory bandwidth, computational limits, and KV-cache on the NPU's INT8 
capabilities and limited memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1602", "title": "A17 Pro Neural Engine and Depthwise Separable Convolutions", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should depthwise separable convolutions be used to improve mobile inference efficiency without destroying accuracy?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1603", "title": "Optimizing On-Device Object Detection for Google Tensor G3", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you redesign an on-device object detection model to meet sub-50ms latency while preserving accuracy and efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1604", "title": "MobileNetV3 Latency Anomaly on Apple A17 Pro Neural Engine", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why MobileNetV3-Large is taking about 200ms on the A17 Pro despite a low FLOP count?", "chain_ids": ["mobile-chain-auto-secondary-002-14"], "chain_positions": {"mobile-chain-auto-secondary-002-14": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1605", "title": "Optimizing Large Context Attention for Mobile NPU", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do the two attention mechanisms trade off memory bandwidth, compute efficiency, and latency on this hardware?", "chain_ids": ["mobile-chain-auto-secondary-010-10"], "chain_positions": {"mobile-chain-auto-secondary-010-10": 2}, "chain_tiers": {"mobile-chain-auto-secondary-010-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1606", "title": "Optimizing Large Language Model Attention for Apple A17 Pro", "topic": "attention-scaling", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you modify LLM attention and memory handling so an 8192-token model can run in real time on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1607", "title": "Model Footprint on Snapdragon Hexagon NPU", "topic": 
"model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the minimum memory footprint of this model on the NPU and discuss whether it's feasible given the available memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1608", "title": "On-Device LLM Deployment Feasibility on Google Tensor G3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze whether this model can run effectively on the NPU, and what are the key considerations for achieving optimal performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1609", "title": "Mobile Model Deployment Feasibility: Samsung Exynos NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you assess whether MobileVisionNet-L can meet 30 FPS on the Exynos 2400 NPU given memory and throughput limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1610", "title": "A17 Pro Mobile Model Memory Footprint", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Propose a simplified convolutional neural network architecture and estimate its peak memory footprint to determine if it is feasible given the 500 MB constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1611", "title": "Diagnosing Mobile LLM Out-of-Memory Deployment", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the OOM cause and decide whether the 7B model or a smaller derivative can deploy on this phone?", "chain_ids": ["mobile-chain-auto-027-09"], "chain_positions": {"mobile-chain-auto-027-09": 1}, "chain_tiers": {"mobile-chain-auto-027-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1612", "title": "LLM Deployment Feasibility on Mobile NPU", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are your immediate concerns, and how would you perform a quick estimate to determine if deploying a 7B parameter LLM on a 12 GB device is achievable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1613", "title": "Evaluating Transformer Architectures for 
On-Device Deployment on Apple A17 Pro", "topic": "model-size-estimation", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare the two INT8 Transformer models for feasibility on A17 Pro across memory, throughput, and power?", "chain_ids": ["mobile-chain-auto-027-10"], "chain_positions": {"mobile-chain-auto-027-10": 2}, "chain_tiers": {"mobile-chain-auto-027-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1614", "title": "Hardware-Aware NAS for Samsung Exynos 2400 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would hardware-aware NAS use TOPS, memory limits, and MCUNet-style constraints to find a mobile architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1615", "title": "A17 Pro Hardware-Aware NAS Performance Analysis", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose why a hardware-aware NAS model still exhibits high latency and power draw despite low theoretical FLOPs?", "chain_ids": ["mobile-chain-auto-secondary-010-12"], "chain_positions": {"mobile-chain-auto-secondary-010-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-010-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1616", "title": "Hardware-Aware NAS for Real-time Mobile AR", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a hardware-aware NAS search for real-time AR segmentation on the Snapdragon 8 Gen 3 Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1617", "title": "Hardware-aware NAS for Mobile LLM Deployment on Google Tensor G3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What memory penalty term would you add to an LLM NAS fitness function for 4-bit weights and an 8 GB peak memory limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1618", "title": "Hardware-Aware NAS for On-Device Deployment on Samsung Exynos 2400 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you define the search space, objective, and search strategy for NAS under strict latency and memory constraints?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1619", "title": "Snapdragon 8 Gen 3: Encoder-Decoder Architecture Tradeoffs for On-Device AI", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the latency, memory, and power tradeoffs among encoder-only, decoder-only, and encoder-decoder models?", "chain_ids": ["mobile-chain-auto-secondary-013-08"], "chain_positions": {"mobile-chain-auto-secondary-013-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1620", "title": "Encoder-Decoder Tradeoffs for On-Device NPU Deployment", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which encoder, decoder, or encoder-decoder architecture would you choose for an Exynos 2400 AI assistant and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1621", "title": "On-Device Real-time Translation with Apple A17 Pro: Architecture Tradeoffs", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the tradeoffs for each approach considering the A17 Pro's specific hardware constraints and the service requirements?", "chain_ids": ["mobile-chain-auto-secondary-013-09"], "chain_positions": {"mobile-chain-auto-secondary-013-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1622", "title": "Optimizing Encoder-Decoder for On-Device Mobile Deployment on Google Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a system architecture, considering the tradeoffs between encoder-only, decoder-only, and full encoder-decoder approaches, specifically for the Google Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1623", "title": "Mobile LLM Architecture Tradeoffs", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you justify your architectural choice, and what back-of-the-envelope calculations would you perform to validate its feasibility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1624", "title": "Mobile LLM Architecture Tradeoffs on Apple A17 Pro", "topic": "encoder-decoder-tradeoffs", "competency_area": 
"architecture", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare a hybrid specialized-model approach against a single decoder-only LLM for an A17 Pro assistant?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1625", "title": "Comparing Mobile NLU Architectures on Google Tensor G3", "topic": "encoder-decoder-tradeoffs", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose performance bottlenecks for each architecture on the Tensor G3 and quantify costs to make your recommendation?", "chain_ids": ["mobile-chain-auto-secondary-013-10"], "chain_positions": {"mobile-chain-auto-secondary-013-10": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1626", "title": "Differentiating Pruning Techniques for Mobile ML Acceleration on Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do unstructured and structured pruning differ for speedups, implementation complexity, and TPU friendliness?", "chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1627", "title": "Pruning for On-Device LLM Inference on Samsung Exynos 2400 NPU", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do structured vs unstructured pruning methods align with the NPU's processing capabilities and memory bandwidth?", "chain_ids": ["mobile-chain-auto-secondary-006-34"], "chain_positions": {"mobile-chain-auto-secondary-006-34": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1628", "title": "Optimizing LLM Deployment on Snapdragon Hexagon NPU with Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use structured pruning to speed up an LLM on the Snapdragon Hexagon NPU while minimizing accuracy loss?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1629", "title": "Optimizing a Large Language Model for On-Device Deployment on Apple A17 Pro with Structured Pruning", 
"topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you combine structured pruning and quantization to fit and accelerate a 7B LLM on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-006-31"], "chain_positions": {"mobile-chain-auto-secondary-006-31": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1630", "title": "Optimizing LLM Deployment on Snapdragon Hexagon NPU via Structured Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you choose between different sparsity patterns to align with the NPU's capabilities and meet memory/latency targets?", "chain_ids": ["mobile-chain-auto-secondary-006-32"], "chain_positions": {"mobile-chain-auto-secondary-006-32": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1631", "title": "Optimizing Large Language Model Inference with Structured Sparsity on Google Tensor G3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity strategy for a Tensor G3 LLM that balances accuracy, latency, and power?", "chain_ids": ["mobile-chain-auto-secondary-006-33"], "chain_positions": {"mobile-chain-auto-secondary-006-33": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1632", "title": "Knowledge Distillation for Vision Model Deployment on Snapdragon 8 Gen 3 NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Design an implementation strategy detailing distillation techniques and how you would leverage the Snapdragon 8 Gen 3 NPU specs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1633", "title": "Diagnosing Knowledge Distillation Deployment Issues on Google Tensor G3 for Mobile ML", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose intermittent latency spikes and high memory use in a distilled student model on Tensor G3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1634", "title": "Optimizing Large Language Models for Apple A17 Pro with Knowledge Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L3", 
"zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you use knowledge distillation to create an A17 Pro-ready LLM student model within memory and power limits?", "chain_ids": ["mobile-chain-auto-secondary-014-19"], "chain_positions": {"mobile-chain-auto-secondary-014-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1635", "title": "Optimizing Knowledge Distillation for Vision Models on Samsung Exynos 2400 NPU", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you explain when distillation would be preferred over pruning for this specific hardware and accuracy target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1636", "title": "Hexagon NPU Graph Compilation Analysis: Latency & Memory for Large Models", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which Hexagon NPU behaviors reveal poor operator lowering or constant folding in an INT8 AOT-compiled transformer?", "chain_ids": ["mobile-chain-auto-secondary-003-08"], "chain_positions": {"mobile-chain-auto-secondary-003-08": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1637", "title": "Tensor G3 On-Device ML Compiler Design for Real-time Vision", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compile and optimize a TensorFlow object detector for TFLite on Tensor G3 to sustain 60 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1638", "title": "Quantization and Operator Lowering for ViT on Exynos 2400 NPU", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical inference latency for a single image under each optimization strategy, assuming full NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1639", "title": "A17 Pro ML Compiler Design for Real-time Vision", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you leverage operator lowering, constant folding, and other compiler-driven optimizations to meet the 15ms latency and ~5W power constraints on the A17 Pro's Neural Engine?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": 
{"mobile-chain-auto-secondary-003-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1640", "title": "CustomSwish Lowering and Constant Folding on Snapdragon Hexagon NPU", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do CustomSwish lowering and constant folding change total INT8 operations and latency on the Snapdragon Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-003-08"], "chain_positions": {"mobile-chain-auto-secondary-003-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-003-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1641", "title": "Optimizing Vision Transformer for Apple A17 Pro Neural Engine", "topic": "graph-compilation", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert and optimize a dynamic PyTorch vision transformer for low-latency A17 Pro Neural Engine execution?", "chain_ids": ["mobile-chain-auto-secondary-003-06"], "chain_positions": {"mobile-chain-auto-secondary-003-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-003-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1642", "title": "Operator Scheduling: Layer Fusion on Samsung Exynos 2400 NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is layer fusion in operator scheduling, and how does it reduce latency for a CNN on an NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-07"], "chain_positions": {"mobile-chain-auto-secondary-012-07": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1643", "title": "Apple A17 Pro Neural Engine: Optimizing Operator Scheduling for Low Latency Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize operator scheduling for an LLM pipeline with idle gaps and CPU-Neural Engine transfers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1644", "title": "Optimizing MobileNetV3 Operator Scheduling on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule depthwise, pointwise, and activation operators for low-latency MobileNetV3 execution on Hexagon NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1645", "title": "Tensor G3 Operator Scheduling for Memory and Throughput Optimization", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule and fuse OpA, OpB, and OpC on Tensor G3 to reduce peak memory versus naive execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1646", "title": "Diagnosing NPU Latency on Samsung Exynos 2400: Operator Scheduling Challenges", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the specific root causes of this performance bottleneck using NPU profiling tools?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1647", "title": "On-Device LLM Deployment on Apple A17 Pro: Operator Scheduling for Latency and Memory Optimization", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule operators in a 5B INT8 LLM on A17 Pro to meet a 50ms per token generation latency within 8 GB memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1648", "title": "Transformer Decode Block Scheduling on Hexagon NPU", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you schedule an LLM block on the NPU to reduce latency, reuse memory, exploit parallelism, and apply layer fusion to meet a 100ms latency target?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1649", "title": "Optimizing Object Detection on Google Tensor G3: Operator Scheduling for MobileNetV3-SSD vs. 
EfficientDet-Lite", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize execution via operator scheduling, and which model achieves better throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1650", "title": "Hexagon NPU Transformer Scheduling for LLMs", "topic": "operator-scheduling", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design transformer operator scheduling on the NPU to maximize throughput and minimize memory traffic?", "chain_ids": ["mobile-chain-auto-secondary-012-08"], "chain_positions": {"mobile-chain-auto-secondary-012-08": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1651", "title": "MLOps Artifacts for On-Device AI with Google Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What non-binary artifacts should a model registry version to keep Tensor G3 on-device AI training and serving consistent?", "chain_ids": ["mobile-chain-auto-secondary-006-18"], "chain_positions": {"mobile-chain-auto-secondary-006-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1652", "title": "MLOps Lifecycle for On-Device Deployment on Snapdragon Hexagon NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design CI/CD, monitoring, and efficiency metrics for an on-device object detector deployment?", "chain_ids": ["mobile-chain-auto-secondary-006-21"], "chain_positions": {"mobile-chain-auto-secondary-006-21": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1653", "title": "On-Device Model Drift Diagnosis on Google Tensor G3", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your investigation steps, focusing on MLOps lifecycle elements?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1654", "title": "MLOps Pipeline for Edge ML on Samsung Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive MLOps pipeline that ensures reliable and reproducible 
model delivery to the mobile devices?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1655", "title": "Optimizing Edge ML Deployment on Samsung Exynos NPU", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically diagnose this 15 FPS bottleneck and quantify a specific optimization toward the 30 FPS target?", "chain_ids": ["mobile-chain-auto-secondary-006-19"], "chain_positions": {"mobile-chain-auto-secondary-006-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-006-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1656", "title": "Scalable MLOps for On-Device AR on Apple A17 Pro", "topic": "mlops-lifecycle", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build an MLOps pipeline for A17 Pro AR that preserves reproducibility, consistency, performance, and power limits?", "chain_ids": ["mobile-chain-auto-secondary-006-20"], "chain_positions": {"mobile-chain-auto-secondary-006-20": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1657", "title": "A17 Pro LLM Deployment with CoreML Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you outline your strategy to address CoreML operator gaps and ensure efficient, low-latency execution within the hardware constraints?", "chain_ids": ["mobile-chain-auto-001-02"], "chain_positions": {"mobile-chain-auto-001-02": 2}, "chain_tiers": {"mobile-chain-auto-001-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1658", "title": "Diagnosing TFLite Performance Regressions on Google Tensor G3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose Tensor G3 TFLite latency spikes and accuracy shifts caused by conversion, op coverage, or delegation gaps?", "chain_ids": ["mobile-chain-auto-001-08"], "chain_positions": {"mobile-chain-auto-001-08": 0}, "chain_tiers": {"mobile-chain-auto-001-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1659", "title": "Mobile NPU Deployment for a PyTorch Object Detector", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you convert, optimize, and validate a PyTorch object detector for 30 FPS on a mobile NPU?", "chain_ids": ["mobile-chain-auto-001-09"], 
"chain_positions": {"mobile-chain-auto-001-09": 1}, "chain_tiers": {"mobile-chain-auto-001-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1660", "title": "Optimizing Model Conversion for Apple A17 Pro Neural Engine", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach this problem to maximize performance and minimize power consumption on the A17 Pro, considering its specific hardware capabilities?", "chain_ids": ["mobile-chain-auto-001-12"], "chain_positions": {"mobile-chain-auto-001-12": 0}, "chain_tiers": {"mobile-chain-auto-001-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1661", "title": "NPU Model Deployment and Optimization on Snapdragon 8 Gen 3", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design and compare the end-to-end conversion, optimization, and deployment pipelines for both models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1662", "title": "Optimizing LLM Deployment on Apple A17 Pro: CoreML Conversion & Operator Gaps", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you handle CoreML conversion gaps and CPU/GPU fallbacks to achieve real-time inference on the A17 Pro?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 5}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1663", "title": "Tensor G3 Model Rollout: Choosing a Strategy", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which rollout strategy would you use for a Tensor G3 on-device model update (7.5 TOPS TPU, 12 GB shared RAM) to safely collect metrics without degrading user battery life?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1664", "title": "Optimizing ML Model Rollouts on Mobile NPUs for a Mobile Application", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a comprehensive A/B and rollout strategy for this new model, considering the unique constraints and opportunities presented by on-device inference?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1665", "title": "Canary Rollout Strategy for ML Model on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you determine the initial canary traffic percentage, and what NPU-centric metrics would you monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1666", "title": "Phased Rollout for 5GB Exynos 2400 Edge Model", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you roll out a 5GB Exynos 2400 edge model with memory, thermal, A/B, telemetry, and rollback gates?", "chain_ids": ["mobile-chain-auto-secondary-011-11"], "chain_positions": {"mobile-chain-auto-secondary-011-11": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1667", "title": "Evaluating Rollout Strategies for On-Device ML Model Architectures on Snapdragon Hexagon NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you A/B test and progressively roll out two model architectures with different latency and operability to decide the final production model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1668", "title": "On-Device ML Model Canary Rollout on Google Tensor G3", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you monitor a mobile canary rollout's performance, ensure stability, and decide on a full rollout or rollback?", "chain_ids": ["mobile-chain-auto-secondary-011-10"], "chain_positions": {"mobile-chain-auto-secondary-011-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-011-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1669", "title": "Diagnosing & Quantifying Performance Bottlenecks in Canary Rollouts on Exynos 2400 NPU", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the 15% latency regression and quantify the impact of a potential fix?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1670", "title": "ML Model Rollout on Apple A17 Pro with Progressive Deployment", "topic": "ab-rollout-strategies", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How should the model update be progressively rolled out while monitoring latency, heat, battery, and ensuring rollback mechanisms?", "chain_ids": ["mobile-chain-auto-secondary-011-12"], "chain_positions": {"mobile-chain-auto-secondary-011-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1671", "title": "On-device PSI Calculation for Predicted Class Drift on Snapdragon NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you efficiently store hourly predicted-class counts and compute PSI for a 10-class on-device classifier?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1672", "title": "Diagnosing Data Drift on Google Tensor G3 for On-Device ML", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you distinguish data drift, concept drift, and training-serving skew for a Tensor G3 image classifier under low-light inputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1673", "title": "On-Device Data Drift Detection for Real-time ML on Samsung Exynos 2400 NPU", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a robust, low-overhead on-device drift detector for an image model with reliable fallback triggers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1674", "title": "Drift Detection Strategies for Mobile NPU Deployments", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should Snapdragon 8 Gen 3 drift detection use server KL divergence, on-device PSI, or a hybrid design?", "chain_ids": ["mobile-chain-auto-secondary-013-06"], "chain_positions": {"mobile-chain-auto-secondary-013-06": 3}, "chain_tiers": {"mobile-chain-auto-secondary-013-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1675", "title": "Exynos NPU Drift: Optimizing On-Device Reliability", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement and optimize on-device drift detection without hurting primary 
inference latency?", "chain_ids": ["mobile-chain-auto-secondary-013-07"], "chain_positions": {"mobile-chain-auto-secondary-013-07": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1676", "title": "On-Device Drift Detection for Gesture Recognition", "topic": "distribution-drift-detection", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design on-device drift detection for gesture recognition while minimizing battery and user impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1677", "title": "Graceful Degradation for On-Device ML on Samsung Exynos 2400", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design degradation ladders, model fallbacks, fail-safe modes, and QoS shedding for Exynos 2400 on-device ML?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1678", "title": "Graceful Degradation for On-Device ML with Snapdragon NPU", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you build a graceful degradation strategy for a mobile AI assistant under thermal, battery, and load constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1679", "title": "Adaptive NLU on Tensor G3: Resource-Aware Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy for the NLU system to maintain critical functionality under dynamic resource constraints?", "chain_ids": ["mobile-chain-auto-secondary-012-04"], "chain_positions": {"mobile-chain-auto-secondary-012-04": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-04": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1680", "title": "Graceful Degradation for On-Device ML on Apple A17 Pro", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy for this system, detailing degradation ladders, model fallbacks, and QoS shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1681", "title": "Graceful Degradation for On-Device AR on Google Tensor G3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": 
"analyze", "status": "published", "phase": "inference", "question": "Which architecture would you recommend and why, specifically referencing the Tensor G3's capabilities, and what specific metrics would you monitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1682", "title": "A17 Pro Vision Model Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you degrade an A17 Pro AR vision system under frame drops, latency, and battery drain while preserving core functionality?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1683", "title": "Graceful Degradation for On-Device LLM on Snapdragon 8 Gen 3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you dynamically adjust the model's output quality or inference parameters to manage NPU and memory load through QoS shedding and degradation ladders?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1684", "title": "NPU Functional Safety for ISO 26262 ASIL B", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware-software co-design elements are needed for ISO 26262 ASIL B compliance on Snapdragon Hexagon?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1685", "title": "Tensor G3 ADAS Functional Safety Analysis (ASIL-B)", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should watchdogs, self-tests, and deterministic TPU execution make a Tensor G3 ADAS warning feature ASIL-B ready?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1686", "title": "Designing a Safety-Critical ML System on Samsung Exynos 2400 NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect a safety-critical pedestrian detection system on the NPU to satisfy ISO 26262 ASIL B?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1687", "title": "Real-time Neural Engine Safety Self-Test on Apple A17 Pro for ADAS (ASIL B)", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", 
"status": "published", "phase": "inference", "question": "What is the maximum allowable duration for the critical self-test per 50ms cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1688", "title": "Designing a Safety-Critical Edge ML System for Autonomous Mobile Robotics on Google Tensor G3", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect the ML pipeline and integrate system-level safety mechanisms to comply with functional safety principles, ensuring deterministic execution and high reliability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1689", "title": "Functional Safety for Autonomous Driving on Exynos 2400", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement self-tests and watchdog timers for ASIL D autonomous driving workloads on the Exynos 2400 NPU?", "chain_ids": ["mobile-chain-auto-secondary-012-09"], "chain_positions": {"mobile-chain-auto-secondary-012-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1690", "title": "Functional Safety Design for ADAS on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "safety-certification", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the NPU software architecture for ASIL-B emergency braking with deterministic execution and fault mitigation?", "chain_ids": ["mobile-chain-auto-secondary-012-10"], "chain_positions": {"mobile-chain-auto-secondary-012-10": 3}, "chain_tiers": {"mobile-chain-auto-secondary-012-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1691", "title": "A17 Pro NPU Adversarial Attack Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What on-device adversarial attack and lightweight defense would you choose for fraud detection on the A17 Pro Neural Engine?", "chain_ids": ["mobile-chain-auto-secondary-009-02"], "chain_positions": {"mobile-chain-auto-secondary-009-02": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1692", "title": "Mitigating Adversarial Attacks on On-Device ML for Google Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a robust system architecture to detect and mitigate adversarial threats on the Tensor G3 
platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1693", "title": "Quantifying Adversarial Attack Impact on Mobile NPU Inference Latency", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the latency increase from adversarial training on a MobileNetV3 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1694", "title": "Diagnosing Adversarial Impact on On-Device ML Reliability with Apple A17 Pro", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose an adversarial side-channel attack on an A17 Pro model running under high system load?", "chain_ids": ["mobile-chain-auto-secondary-009-02"], "chain_positions": {"mobile-chain-auto-secondary-009-02": 1}, "chain_tiers": {"mobile-chain-auto-secondary-009-02": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1695", "title": "Designing an Adversarially Robust Mobile ML System on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the architecture and defenses to maximize robust and secure operation within NPU constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1696", "title": "On-Device Adversarial Input Detection for Safety-Critical ML on Tensor G3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you detect and mitigate input-level adversarial attacks on a real-time object detector without impacting performance?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1697", "title": "Mobile NPU Adversarial Robustness Evaluation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you evaluate adversarial training versus randomized smoothing for robustness, latency, and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1698", "title": "Optimizing Adversarial Robustness on Snapdragon NPU for Mobile Reliability", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the specific bottleneck causing this performance degradation and propose an optimization strategy to restore 
application reliability, quantifying the expected improvement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1699", "title": "On-device Adversarial Attack Mitigation for Google Tensor G3 ML Model", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you defend a Tensor G3 harmful-content classifier against imperceptible adversarial perturbations within real-time limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1700", "title": "Monitoring On-Device ML Health on Snapdragon 8 Gen 3 NPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What key telemetry metrics would you collect from the NPU and application to monitor runtime health and detect degradation?", "chain_ids": ["mobile-chain-auto-secondary-006-23"], "chain_positions": {"mobile-chain-auto-secondary-006-23": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1701", "title": "Quantifying Straggler Impact on A17 Pro ML Inference Latency", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compute added latency from A17 Pro inference stragglers and define a production Straggler Impact Score?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 0}, "chain_tiers": {"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1702", "title": "Diagnosing Latency Spikes on Snapdragon 8 Gen 3 NPU", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose sustained-load latency spikes in a Snapdragon 8 Gen 3 video analytics model using NPU telemetry?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1703", "title": "Diagnosing Stragglers on Mobile NPU with Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you extend monitoring to find intermittent NPU stragglers and reduce MTTR?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1704", "title": "Designing Reliable On-Device ML Monitoring for Mobile AI", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": 
"L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you architect low-power monitoring for an on-device AI feature to catch regressions, stragglers, drift, and failures?", "chain_ids": ["mobile-chain-auto-secondary-006-24"], "chain_positions": {"mobile-chain-auto-secondary-006-24": 2}, "chain_tiers": {"mobile-chain-auto-secondary-006-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1705", "title": "Diagnosing and Quantifying NPU Bottlenecks on Snapdragon 8 Gen 3 with Telemetry", "topic": "monitoring-observability", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you detect NPU stragglers, diagnose bottlenecks, and quantify optimization impact before rollout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1706", "title": "On-Device Real-time Sensor Fusion Pipeline for Apple A17 Pro", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a power-efficient A17 Pro sensor and camera data pipeline for immediate on-device inference?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 3}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1707", "title": "Optimizing On-Device Sensor Data Ingestion for Mobile ML", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you reduce data loading and preprocessing bottlenecks for a 100 MB/s raw sensor stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1708", "title": "Optimizing On-Device ML Data Ingestion for Real-time Video on Google Tensor G3", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the potential bottlenecks in this mobile data pipeline and propose concrete optimization strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1709", "title": "Real-time Mobile ML Data Pipeline Optimization on Exynos 2400 NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a 4K60 Exynos 2400 data pipeline that keeps the NPU fed without dropping frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1710", "title": "On-Device Data 
Schema Validation for Google Tensor G3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What lightweight data schema validation strategy ensures sensor integrity before inference without burning the CPU budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1711", "title": "Data Corruption in Edge ML Model on Samsung Exynos 2400 NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and prevent malformed camera data from degrading an Exynos 2400 object detector?", "chain_ids": ["mobile-chain-auto-secondary-012-01"], "chain_positions": {"mobile-chain-auto-secondary-012-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1712", "title": "On-Device Data Quality & Validation for Health ML on Apple A17 Pro", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design on-device data quality, lineage, and anomaly detection for a health monitoring pipeline within a 5W continuous budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1713", "title": "On-Device ML Data Quality for Samsung Exynos 2400 NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Design a comprehensive data quality and validation system for this pipeline. How would you identify and handle unexpected patterns or outliers in the sensor data stream or model inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1714", "title": "On-Device Data Quality for A17 Pro Mobile ML", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a robust on-device data quality pipeline for high-frequency sensor data on the Apple A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1715", "title": "On-Device ML Data Quality: Centralized vs. 
Edge Validation on Snapdragon NPU", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What are the trade-offs between centralized and edge validation, and which approach would you recommend for a mission-critical mobile application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1716", "title": "On-Device Data Integrity and Schema Evolution for ML on Google Tensor G3", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you enforce evolving data contracts and on-device anomaly checks for Tensor G3 physiological sensor streams?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1717", "title": "Exynos NPU Data Contract Violations", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and redesign validation, fallback, and telemetry for NPU stalls from sensor contract violations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1718", "title": "Quantifying Inter-Annotator Agreement for Mobile NPU Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which inter-annotator agreement metric would you use for mobile dataset labels, and why is it better than simple agreement?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1719", "title": "Optimizing On-Device Image Classification Dataset with Active Learning on Apple A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an active learning pipeline that minimizes on-device cost while ensuring high-quality data selection?", "chain_ids": ["mobile-chain-auto-027-15"], "chain_positions": {"mobile-chain-auto-027-15": 0}, "chain_tiers": {"mobile-chain-auto-027-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1720", "title": "On-Device Object Detection Dataset Curation for Snapdragon 8 Gen 3 NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a continuous active learning strategy to identify and mitigate biases on-device using the Snapdragon 8 Gen 3 NPU?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 2}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1721", "title": "On-Device Active Learning for Image Classification on Mobile TPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design an on-device data selection strategy and annotation workflow to efficiently integrate new labels while respecting memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1722", "title": "Diagnosing Mobile Gesture Model Bias on Exynos NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose real-world gesture false negatives as a dataset curation and labeling problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1723", "title": "On-device Active Learning for Gesture Recognition Dataset Curation on Apple A17 Pro", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a privacy-preserving on-device active learning system for A17 Pro gesture recognition data curation?", "chain_ids": ["mobile-chain-auto-027-15"], "chain_positions": {"mobile-chain-auto-027-15": 1}, "chain_tiers": {"mobile-chain-auto-027-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1724", "title": "On-Device Rare Class Data Curation for Snapdragon NPU", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How many expert annotation person-hours are needed to collect 5,000 rare-class instances at 30 seconds per image?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1725", "title": "On-Device Dataset Curation for Bias Mitigation on Snapdragon 8 Gen 3", "topic": "dataset-curation", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you leverage the NPU's capabilities for data selection, active learning, and annotation workflows to mitigate low-light and rural detection bias?", "chain_ids": ["mobile-chain-auto-027-14"], "chain_positions": {"mobile-chain-auto-027-14": 3}, "chain_tiers": {"mobile-chain-auto-027-14": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1726", "title": "Real-time Sensor Stream Processing on Apple A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you process high-frequency sensor streams for A17 Pro anomaly detection with low latency and 
low power?", "chain_ids": ["mobile-chain-auto-secondary-012-11"], "chain_positions": {"mobile-chain-auto-secondary-012-11": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1727", "title": "Optimizing Real-Time Sensor Processing on Snapdragon NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the system behavior and explain why these issues are occurring, considering the NPU's specifications and the nature of mobile streaming data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1728", "title": "Real-time Physiological Anomaly Detection on Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the stream processing and ML pipeline on the Tensor G3 to balance real-time anomaly detection with power efficiency?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 1}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1729", "title": "Real-Time Sensor Data Ingestion and Anomaly Detection on Edge NPU", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you ingest 1 kHz sensor data and run 1-second-window anomaly inference on the NPU within 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1730", "title": "Diagnosing High Latency in On-Device Streaming Feature Computation on Apple A17 Pro", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do sensor feature computations exceed 500 ms when raw ingestion is stable and the target is 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1731", "title": "Real-time Edge ML System Design for Mobile Sensor Data", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a sub-10 ms Snapdragon 8 Gen 3 sensor-ingestion pipeline for IMU, audio, and camera metadata?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1732", "title": "On-Device Sensor Stream Processing with Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": 
"analyze", "status": "published", "phase": "inference", "question": "How would you design a streaming data pipeline to handle 1000 samples/s ingestion, pre-processing, and ML inference on the Tensor G3 architecture?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 0}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1733", "title": "Real-Time Sensor Fusion on Exynos 2400: Edge vs. Hybrid Processing Architectures", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare pure edge and hybrid edge-cloud sensor processing for a fitness app?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1734", "title": "Real-time Gesture Recognition on Google Tensor G3", "topic": "streaming-ingestion", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the data ingestion and feature computation pipeline to meet the strict sub-50ms end-to-end latency constraint?", "chain_ids": ["mobile-chain-auto-secondary-012-12"], "chain_positions": {"mobile-chain-auto-secondary-012-12": 2}, "chain_tiers": {"mobile-chain-auto-secondary-012-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1735", "title": "Designing On-Device Data Efficiency for Personalized ML on Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design the data selection, retention, and processing pipeline for an on-device personalized ML system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1736", "title": "Optimizing On-Device ML with Coreset Selection on Snapdragon 8 Gen 3", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What maximum coreset size fits a 1 hour retrain if each sample needs 2e9 ops total and 1 KB in a 10 GB sample cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1737", "title": "On-Device Vision Model Data Optimization for Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What measures would you implement to prevent model collapse risks during aggressive data optimization on the A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1738", "title": "Optimizing 
On-Device ML with Coreset Selection on Apple A17 Pro", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can coreset selection reduce the data footprint for efficient on-device personalization of an A17 Pro recommendation model?", "chain_ids": ["mobile-chain-auto-secondary-014-17"], "chain_positions": {"mobile-chain-auto-secondary-014-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1739", "title": "Federated Predictive Text Learning on Apple A17 Pro", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would your design orchestrate federated averaging for millions of A17 Pro devices handling non-IID data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1740", "title": "Federated LLM Personalization on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What system architecture, data flow, and key algorithms would you use, and how would you handle model updates and aggregation in the constrained mobile environment?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 4}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1741", "title": "Federated LLM Personalization on Google Tensor G3: Scaling and Non-IID Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you 'size' the overall system to handle the expected scale and data characteristics?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1742", "title": "Federated Learning Optimization for Cross-Device Personalization on Edge NPUs", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you reduce federated learning communication delay, staleness, and non-IID effects on Exynos 2400 devices?", "chain_ids": ["mobile-chain-auto-013-06"], "chain_positions": {"mobile-chain-auto-013-06": 3}, "chain_tiers": {"mobile-chain-auto-013-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1743", "title": "Federated Learning on Apple A17 Pro: Optimizing for Non-IID Data and Communication Efficiency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a FedAvg system for the A17 Pro considering its 35 TOPS, 5W budget, and non-IID data challenges?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1744", "title": "DP-SGD and Privacy Budgeting on Snapdragon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do epsilon, delta, noise calibration, and the privacy-utility tradeoff apply to DP-SGD on Snapdragon Hexagon NPU?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1745", "title": "DP-SGD on Mobile NPU: Securing User Data for Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you implement differentially private federated learning for a MobileNetV2 health model on the mobile NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1746", "title": "DP-SGD Privacy-Utility Tradeoff Diagnosis for Mobile Recommendations", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose poor DP-SGD recommendation utility while preserving an annual epsilon budget of 8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1747", "title": "On-Device DP-SGD for Federated Learning on Snapdragon 8 Gen 3 Hexagon NPU", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design the noise calibration mechanism and manage the privacy-utility tradeoff given NPU constraints?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 2}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1748", "title": "DP-SGD on Google Tensor G3 for On-Device Federated Learning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calibrate clipping norms and Gaussian noise for DP-SGD under on-device memory and compute limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1749", "title": "Optimizing DP-SGD on Snapdragon 8 Gen 3 Hexagon NPU for 
Mobile Privacy", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize DP-SGD clipping and noise generation on a mobile NPU without weakening privacy guarantees?", "chain_ids": ["mobile-chain-auto-secondary-008-05"], "chain_positions": {"mobile-chain-auto-secondary-008-05": 1}, "chain_tiers": {"mobile-chain-auto-secondary-008-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1750", "title": "Designing a Fair AI System on Edge with Samsung Exynos 2400 NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an on-device fairness evaluation pipeline for moderation under NPU memory and compute limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1751", "title": "On-Device Fairness Evaluation for Image Classification on Apple A17 Pro", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What TOPS, power, and memory budget support on-device demographic-parity monitoring without hurting user experience?", "chain_ids": ["mobile-chain-auto-secondary-013-17"], "chain_positions": {"mobile-chain-auto-secondary-013-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-013-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1752", "title": "Diagnosing Bias in NPU-Accelerated Facial Verification for Mobile", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What metrics would you collect, and what steps would you take to systematically pinpoint the root cause of this observed unfairness?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1753", "title": "Designing a Fair and Efficient On-Device ML System for Content Moderation on Google Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you specify a fair, efficient Tensor G3 content moderation model within 50ms latency and 200MB footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1754", "title": "Fairness Evaluation of On-Device Facial Recognition on Apple A17 Pro", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": 
"evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an evaluation plan to compare these two architectures for fairness, explicitly considering the constraints and capabilities of the target device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1755", "title": "On-Device Fairness Evaluation Architecture for Facial Recognition on Snapdragon Hexagon NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an architecture and methodology for continuously monitoring and reacting to potential demographic parity violations on-device within memory constraints?", "chain_ids": ["mobile-chain-auto-secondary-013-18"], "chain_positions": {"mobile-chain-auto-secondary-013-18": 2}, "chain_tiers": {"mobile-chain-auto-secondary-013-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1756", "title": "On-device Fairness Optimization for Google Tensor G3", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and optimize Tensor G3 hardware bottlenecks that worsen subgroup fairness under high load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1757", "title": "Bias Mitigation on Edge AI: NPU", "topic": "fairness-evaluation", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What metrics would you prioritize, and how would you adapt your evaluation approach to measure intersectional fairness given a 12 GB memory and 34.7 TOPS edge constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1759", "title": "Responsible AI Governance for On-Device LLM on Samsung Exynos 2400 NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might the model still exhibit biases on-device, and how do mobile NPU constraints complicate Responsible AI governance frameworks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1760", "title": "On-Device Responsible AI Guardrail Overhead on Snapdragon NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the additional INT8 TOPS and average memory bandwidth required by this guardrail?", "chain_ids": ["mobile-chain-auto-secondary-011-13"], "chain_positions": {"mobile-chain-auto-secondary-011-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-011-13": "secondary"}, 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1761", "title": "Bias Detection in an On-Device Summarization LLM", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose a post-deployment gender-bias regression in an on-device Tensor G3 summarization LLM?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1762", "title": "On-Device Responsible AI: Content Moderation Guardrails on Samsung Exynos 2400 NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design responsible AI architecture and governance for Exynos 2400 on-device video content moderation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1763", "title": "Responsible AI Evaluation on Snapdragon 8 Gen 3 for On-Device Sentiment Analysis", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you choose between two mobile sentiment models when fairness tail latency conflicts with average efficiency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1764", "title": "Realizing Responsible AI on Google Tensor G3: On-Device Content Moderation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Tensor G3 on-device content moderation system handle guardrails, impact assessment, and resource limits?", "chain_ids": ["mobile-chain-auto-secondary-011-15"], "chain_positions": {"mobile-chain-auto-secondary-011-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-011-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1765", "title": "Optimizing Responsible AI Guardrails on Samsung Exynos NPU", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you optimize Exynos 2400 responsible AI guardrails to reduce total inference latency from 350ms back down to the 200ms target?", "chain_ids": ["mobile-chain-auto-secondary-011-14"], "chain_positions": {"mobile-chain-auto-secondary-011-14": 1}, "chain_tiers": {"mobile-chain-auto-secondary-011-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1766", 
"title": "Designing Responsible AI for On-Device LLM Deployment on Apple A17 Pro", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a Responsible AI framework for an A17 Pro on-device LLM handling sensitive personal data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1767", "title": "Token-Level Safety Classifier Bottleneck", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 200M INT8 safety classifier cut Gemini Nano generation from 20 to 5 tokens/sec on Tensor G3?", "chain_ids": ["mobile-chain-auto-secondary-009-01"], "chain_positions": {"mobile-chain-auto-secondary-009-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-009-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1768", "title": "Tensor G3 TPU Memory Bandwidth Bottleneck", "topic": "accelerator-comparison", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the TPU compute capacity severely underutilized, and what physical characteristic constrains the generation rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1769", "title": "On-Device Coreset Selection for Gemini Nano", "topic": "data-efficiency-selection", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does random pruning cause model collapse under these hardware constraints, and what is the tradeoff of switching to gradient-based coreset selection?", "chain_ids": ["mobile-chain-auto-secondary-014-18"], "chain_positions": {"mobile-chain-auto-secondary-014-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-18": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1770", "title": "Analyzing NAS Latency on A17 Pro", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can an A17 Pro NAS model with 30% fewer MACs have 40% higher latency, and how would you analyze it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1771", "title": "Data Validation Gate Memory Bottleneck", "topic": "data-quality-validation", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an INT8 autoencoder gate plus 1 GB main model miss a 33.3 ms video deadline on Snapdragon 8 Gen 3?", "chain_ids": ["mobile-chain-auto-secondary-012-02"], "chain_positions": {"mobile-chain-auto-secondary-012-02": 1}, "chain_tiers": 
{"mobile-chain-auto-secondary-012-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1780", "title": "Roofline Comparison: CPU vs Neural Engine for LLM on A17 Pro", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does roofline analysis guide the choice between CPU and Neural Engine for a batch-1 INT4 7B LLM decode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1781", "title": "Attention Head Pruning for On-Device Transformer on Snapdragon 8 Gen 3", "topic": "attention-scaling", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the latency and accuracy trade-offs of pruning 50% of attention heads in a 6-layer 12-head transformer on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1782", "title": "Dynamic Quantization vs Static Quantization for Transformer on Tensor G3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you choose between dynamic and static quantization for a Tensor G3 assistant model with variable activation ranges?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1783", "title": "Unified Memory Architecture Impact on LLM Decode on A17 Pro", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much does UMA contention slow 7B INT4 LLM decode under display, camera, and background bandwidth load?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 1}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1784", "title": "Batch Size 1 Optimization for Interactive LLM on Snapdragon 8 Gen 3", "topic": "batching-strategies", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain the utilization gap and propose how to increase decode throughput?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1785", "title": "Energy per Inference on A17 Pro Neural Engine and CPU Cores", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much energy does a 50M-FLOP inference use on A17 Pro Neural Engine, E-cores, and P-cores?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1786", "title": "Core ML Palettization vs GGUF Quantization for LLM on iPhone", "topic": "model-serving-infrastructure", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does GGUF Q4_K_M reach 15 tok/s while Core ML palettization reaches 12 tok/s for Llama-3-8B on iPhone 15 Pro?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1787", "title": "Unstructured Pruning Latency on Snapdragon 8 Gen 3", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning on MobileViT-XS yield only a 15% latency reduction on Snapdragon 8 Gen 3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1788", "title": "Thermal Aware Inference Scheduling on iPhone During Photo Processing", "topic": "thermal-management", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should a photo ML task schedule 5s of A17 Pro Neural Engine work during 4K30 capture at thermal state 2?", "chain_ids": ["mobile-chain-auto-secondary-013-23"], "chain_positions": {"mobile-chain-auto-secondary-013-23": 1}, "chain_tiers": {"mobile-chain-auto-secondary-013-23": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1789", "title": "On-Device KV Cache Memory Budget for LLM", "topic": "attention-scaling", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV-cache memory does the 2B INT4 LLM use at 3000 tokens with 28 layers, 16 KV heads, and FP16 cache?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1790", "title": "OOM Diagnosis for LLM Context Extension on A17 Pro", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the memory growth pattern and identify what causes the OOM at 2048 tokens?", "chain_ids": ["mobile-chain-auto-014-20"], "chain_positions": {"mobile-chain-auto-014-20": 1}, "chain_tiers": {"mobile-chain-auto-014-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1791", "title": "Multi-Task Model Specification for On-Device AI Assistant on A17 Pro", "topic": "model-serving-infrastructure", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which model sizes, quantization levels, and execution units fit ASR, intent, and LLM generation in 1.5 GB on iPhone 15 Pro?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1793", "title": "Grouped Query Attention Memory Impact for Long Context on Snapdragon 8 Gen 3", "topic": "attention-scaling", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What KV cache size does GQA use for a 16K-context Mistral-7B, and how does it compare with MHA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1794", "title": "On-Device Training with LoRA Adapter on A17 Pro — Feasibility Analysis", "topic": "pruning-sparsity", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Is LoRA fine-tuning a 3B INT4 iOS language model with rank 8 adapters and 100 examples feasible on A17 Pro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1795", "title": "Mixed-Precision BERT NaN Loss with Loss Scaling on Tensor G3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why might mixed-precision BERT training for Tensor G3 hit NaN loss at step 1200 despite loss scaling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1796", "title": "EfficientViT MSA and FFN Roofline on Snapdragon NPU", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do EfficientViT MSA and FFN blocks reach 32 TOPS and 8 TOPS on a 45 TOPS Snapdragon NPU, and what optimizations apply to each?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1797", "title": "Energy-Delay Product Optimization for Inference on A17 Pro", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you calculate energy-delay product (EDP) for each mode and determine when each is optimal?", "chain_ids": ["mobile-chain-bucket-powerbud-05"], "chain_positions": {"mobile-chain-bucket-powerbud-05": 2}, "chain_tiers": {"mobile-chain-bucket-powerbud-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1798", "title": "Adaptive Bitrate Quantization Based on Thermal State on Snapdragon 8 Gen 3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Design the transition logic with hysteresis to allow switching among INT8, INT4, and INT2 thermal states without oscillating?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1801", "title": "Mobile Secure Aggregation Fan-In", "topic": "federated-learning", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How many secure-aggregation validation shards are needed to process 80,000 mobile updates within 4 seconds?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1805", "title": "Shared Mmap vs Heap Allocation for Multi-Process Edge LLM", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the tradeoffs between heap allocation and shared memory-mapped (mmap) file access, and determine which architecture is required for this system?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1806", "title": "Implementing Shared mmap for Edge LLMs", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much RAM does shared read-only mmap save for two Jetson Orin processes loading the same 15 GB INT8 LLM, and which flags are required?", "chain_ids": ["mobile-chain-auto-secondary-014-06"], "chain_positions": {"mobile-chain-auto-secondary-014-06": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1807", "title": "CoreML vs ONNX Runtime on Apple A17 Pro Neural Engine", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How would you explain and reduce the 50% latency overhead of ONNX Runtime versus direct CoreML on an A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1808", "title": "ONNX to CoreML and QNN Runtime Strategy for Mobile Style Transfer", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which iOS and Android runtimes meet <20 ms style transfer on A17 Pro and Snapdragon 8 Gen 3 from one model codebase?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1809", "title": "CoreML vs MPSGraph 
for A17 Pro Diffusion ANE Support", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Should an A17 Pro diffusion model use CoreML or MPSGraph when attention-heavy layers limit ANE support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1810", "title": "Snapdragon QNN SDK vs TFLite for Hexagon NPU Utilization", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does QNN run MobileNetV3-large at 1.8ms versus 3.2ms for TFLite Hexagon on Snapdragon 8 Gen 3, and which should you use?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1811", "title": "Portable On-Device LLM Inference Across Mobile Platforms", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you serve a 3B LLM at over 10 tok/s on both A17 Pro and Snapdragon 8 Gen 3 without exceeding mobile memory?", "chain_ids": ["mobile-chain-auto-secondary-016-19"], "chain_positions": {"mobile-chain-auto-secondary-016-19": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1812", "title": "Android NNAPI Delegation Failures and Fallback Performance", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Why does the model run 9x slower on the Exynos NNAPI despite the delegate being active?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1813", "title": "WebAssembly SIMD as a Universal Mobile Inference Backend", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate whether WASM SIMD can replace native mobile inference for a MobileNetV2 model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1814", "title": "Thermal Throttling Impact on Portable Model Benchmarks", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Why do mobile benchmarks deteriorate, and how does this affect portability comparisons?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1815", "title": "Portable Feature Extraction Pipeline: Camera Input to Model Input", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": 
"L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would one preprocessing interface convert iOS BGRA and Android YUV_420_888 frames into a float tensor efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1816", "title": "Unified Model Versioning for Multi-Platform Mobile Deployment", "topic": "software-portability", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How should a mobile team manage 12 models, 3 quality tiers, and monthly updates without version drift?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1817", "title": "On-Device Interconnect Bottleneck for NPU Inference", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the NoC bandwidth, not the NPU's TOPS rating, determine achievable throughput for LLM inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1818", "title": "SoC Die-to-Die Interconnect for Heterogeneous Inference", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the layer-to-accelerator mapping that minimizes cross-accelerator data transfer?", "chain_ids": ["mobile-chain-auto-secondary-017-09"], "chain_positions": {"mobile-chain-auto-secondary-017-09": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1819", "title": "Chiplet Interconnect Bandwidth for Multi-Model Mobile", "topic": "interconnect-topology", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate total memory bandwidth demand and determine if the NoC is the bottleneck?", "chain_ids": ["mobile-chain-auto-secondary-017-09"], "chain_positions": {"mobile-chain-auto-secondary-017-09": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1820", "title": "On-Device Differential Privacy for Mobile ML", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With 100M users each contributing one data point per day, what epsilon provides meaningful utility, and how does LDP accuracy compare to central DP?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1821", "title": "DP Noise Calibration on Mobile NPU", "topic": "differential-privacy", 
"competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the standard deviation of Gaussian noise you must add to the clipped gradient (norm=1.0), and how much memory does the noise generation require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1822", "title": "Differential Privacy Impact on On-Device Model Size", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With batch size 32, how much memory do per-sample gradients consume, and does this fit alongside the model and activations within the OS app limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1833", "title": "Understanding why autograd is disabled on Core ML and how to replicate gradient-based effects", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How can a mobile app support few-shot adaptation when inference frameworks do not provide backward passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1834", "title": "Minimizing computational graph size for on-device fine-tuning on Snapdragon 8 Gen 3", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you shrink the autograd graph for LoRA fine-tuning a 1B LLM?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": {"mobile-chain-auto-secondary-016-15": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1835", "title": "Implementing memory-efficient backprop for on-device RL on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you compute PPO old-policy ratios without keeping two full autograd graphs in memory?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": {"mobile-chain-auto-secondary-016-15": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1836", "title": "Debugging gradient vanishing in a mobile RNN model on Snapdragon 8 Gen 3", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you diagnose and fix vanishing recurrent gradients in a 500-step GRU trained natively on a mobile 
accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1837", "title": "Optimizing Core ML conversion of custom autograd operations on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you convert a custom sparse attention autograd operation to Core ML for NPU deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1838", "title": "Efficient gradient-based hyperparameter search within 8GB on Snapdragon", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you make DARTS architecture search fit in 8GB on Snapdragon without running the full supernet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1839", "title": "Profiling PyTorch Mobile image inference tensor copies", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you profile and remove unnecessary tensor copies in a PyTorch Mobile image pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1840", "title": "Implementing gradient-based input attribution for on-device explainability on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you implement Integrated Gradients for on-device explainability on A17 Pro without 50 sequential passes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1841", "title": "Managing the autograd tape for battery-constrained on-device learning on Snapdragon", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you manage autograd and update scheduling so background personalization does not drain battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1842", "title": "Designing a two-phase autograd pipeline for on-device neural style transfer on A17 Pro", "topic": "autograd-computational-graphs", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you structure neural style transfer so only the input image needs gradients, keeping VGG efficient enough to beat a 60-second target?", "chain_ids": ["mobile-chain-auto-secondary-016-15"], "chain_positions": 
{"mobile-chain-auto-secondary-016-15": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1843", "title": "Chiplet Compute-to-Memory Ratio for Mobile SoC Design", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you explain the design rationale for matching compute-to-memory ratio to target workloads?", "chain_ids": ["mobile-chain-auto-secondary-016-16"], "chain_positions": {"mobile-chain-auto-secondary-016-16": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1844", "title": "Apple A17 Pro Die Architecture and ANE Integration", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "How does the A17 Pro's Neural Engine integrate with the CPU and GPU at the die level, and why does this differ from external accelerator chiplets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1845", "title": "Snapdragon 8 Gen 3 Hexagon NPU Memory Architecture", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you profile the memory hierarchy to explain the 40ms decode latency for a 1B model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1846", "title": "Evaluating UCIe chiplets for Snapdragon mobile ML accelerators", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can UCIe integrate a third-party ML accelerator into Snapdragon within mobile power, package, and latency limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1847", "title": "A17 Pro Object Detection Thermal Throttling", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What throttling factor explains the 25ms to 35ms slowdown, and what mitigation fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1848", "title": "DPO for Mobile On-Device Alignment: Feasibility Analysis", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How would you evaluate whether DPO or PPO is feasible and calculate the minimum memory requirements?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1849", "title": "LoRA Inference Optimization on A17 Pro Neural Engine", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Why does Core ML reject dynamic LoRA on A17 Pro, and how should the adapters be compiled?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1850", "title": "Differential privacy for A17 Pro keyboard LoRA training", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you design a differential privacy mechanism for rank-4 on-device LoRA training on the A17 Pro?", "chain_ids": ["mobile-chain-auto-secondary-016-18"], "chain_positions": {"mobile-chain-auto-secondary-016-18": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1851", "title": "Quantized LoRA Serving on Snapdragon 8 Gen 3", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should BF16 LoRA adapters run on Snapdragon 8 Gen 3 when Hexagon supports only INT8/INT4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1852", "title": "Multi-Adapter Session Management on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design the session management system to efficiently switch between 5 LoRA adapters on an A17 Pro with 8 GB RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1853", "title": "Privacy Budget Allocation for On-Device Fine-Tuning", "topic": "differential-privacy", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a per-epoch budget allocation strategy that maximizes model utility while respecting the lifetime budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1855", "title": "Guardrail Latency Budget on a Mobile LLM", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which guardrail mix fits a 200 ms safety budget after a 600 ms prefill on an NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1856", "title": "Algorithmic Accountability for On-Device Health Decisions", "topic": 
"responsible-ai", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a responsible AI governance framework that operates within mobile compute constraints and does not transmit raw health data off-device?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1857", "title": "ONNX Runtime CPU Fallback from Missing NPU Execution Provider", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you diagnose the ONNX Runtime portability issue for operators falling back to the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1858", "title": "Unified Memory Coherency for Mobile Chiplet Camera Pipeline", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix unified-memory cache coherency stalls between an ISP die and NPU die in a camera pipeline?", "chain_ids": ["mobile-chain-auto-secondary-016-16"], "chain_positions": {"mobile-chain-auto-secondary-016-16": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1859", "title": "Multi-Chiplet Thermal Throttle Recovery for Mobile Gaming", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you explain NPU throttling in a mobile chiplet gaming workload and redesign the thermal schedule?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1860", "title": "On-Device LLM Cold-Start and KV Setup Latency", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose and fix 900 ms of cold tokenization and KV setup for a mobile Gemma 2B prompt, assuming 18 layers, 8 KV heads, and 256-dim FP16 KV?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1861", "title": "Chiplet Die-Count Impact on Mobile SoC PCB Routing", "topic": "chiplet-architecture", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you justify moving a chiplet mobile SoC board from 6 to 8 PCB layers for routing, power, and EMI risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1862", "title": "On-Device LoRA OOM on an 8GB Phone", "topic": "model-adaptation-systems", "competency_area": "architecture", 
"track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does LoRA training for a 1B model OOM at batch 1 on an 8GB phone, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1863", "title": "Per-User LoRA Adapter Privacy on Mobile", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which privacy attacks are realistic for on-device per-user LoRA adapters, and what mitigation plan would you ship for limited local training and storage?", "chain_ids": ["mobile-chain-auto-secondary-016-18"], "chain_positions": {"mobile-chain-auto-secondary-016-18": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1864", "title": "Adapter Selection Latency in Multi-Persona Mobile Assistant", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you reduce mobile assistant LoRA adapter selection and loading latency from 200ms to under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1865", "title": "RLHF-Lite: Implicit Feedback Collection for Mobile Fine-Tuning", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design a lightweight on-device RLHF pipeline that runs without a dedicated reward model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1866", "title": "LoRA Adapter Compatibility Across Model Version Updates", "topic": "model-adaptation-systems", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Phi-3 Mini v1.0 LoRA adapters degrade for 15% of users after a v1.1 base-model update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1870", "title": "A17 Pro NPU Video Frame Drops", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the required frame buffer size K such that the M/M/1/K finite-queue blocking probability P_K is strictly less than 1%?", "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1871", "title": "Federated Learning Update Window", "topic": "network-bandwidth-bottlenecks", 
"competency_area": "networking", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can the weight update complete within the OS background window without further compression?", "visual": {"kind": "svg", "path": "mobile-1871.svg", "alt": "Node diagram showing mobile app attempting to push 10 MB through a 2 Mbps 4G pipe to a cloud aggregator.", "caption": "Federated Update Transmission"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1872", "title": "A17 Pro LLM Concurrent Queue Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the maximum average request arrival rate (requests per second) the NPU can handle while ensuring the average queue wait time does not exceed 1.6 seconds?", "visual": {"kind": "svg", "path": "mobile-1872.svg", "alt": "Curve showing queue wait time increasing as the arrival rate approaches the NPU capacity of 2.5 requests per second.", "caption": "M/D/1 Wait Time Analysis"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1873", "title": "Mobile LoRA Atomic Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum LoRA adapter size that can be atomically saved within a 50ms OS kill warning window, and what strategy ensures safety?", "visual": {"kind": "svg", "path": "mobile-1873.svg", "alt": "Timeline showing OS kill warning at T=0, 50ms write operation to a temp file, followed by an atomic rename.", "caption": "Atomic Save within OS Warning Window"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1874", "title": "Split Computing BLE Bandwidth", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum allowable intermediate tensor size to ensure the BLE transmission adds no more than 100ms of latency?", "visual": {"kind": "svg", "path": "mobile-1874.svg", "alt": "Directed graph showing model layers on a watch, a BLE transmission bottleneck, and final layers on a phone.", "caption": "Watch-to-Phone Split Architecture"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1875", "title": "Snapdragon Segmentation Video Uplink", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What optimization to the binary mask transmission architecture ensures the frame push stays within the 5 Mbps bandwidth limit?", "visual": {"kind": "svg", "path": "mobile-1875.svg", "alt": "Graph showing raw segmentation masks passed through an RLE encoder before hitting the cellular uplink bottleneck.", "caption": "Segmentation Mask Uplink 
Optimization"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1876", "title": "A17 Pro NVMe Intermediate State Checkpoint", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the storage latency to write the checkpoint and determine if it is faster to recompute the frame (recompute takes 15ms)?", "visual": {"kind": "svg", "path": "mobile-1876.svg", "alt": "A bar chart comparing the 15ms recompute time against the combined 8ms read/write checkpointing time.", "caption": "Recompute vs Checkpoint Latency"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1877", "title": "Snapdragon LPDDR5 Weight Bound", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the absolute minimum latency to evaluate a single dense linear layer containing 10 million weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1879", "title": "WASM ML Model Fetch Over 3G", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total download time and determine the optimal chunk size to provide an interactive loading bar updating 10 times a second?", "visual": {"kind": "svg", "path": "mobile-1879.svg", "alt": "Diagram showing a monolithic 20MB model being sliced into 50KB chunks for streaming over a slow network.", "caption": "WASM Model Chunking"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1880", "title": "A17 Pro 3B Model Sequence Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum sequence length achievable before the KV cache exceeds the strict 2.0 GB OS memory budget?", "visual": {"kind": "svg", "path": "mobile-1880.svg", "alt": "A bar chart breaking down the 2.0GB memory limit into 1.5GB of INT4 weights and 0.5GB of remaining KV cache space.", "caption": "iOS 2.0GB App Memory Budget"}, "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 5}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1881", "title": "5G Model Update Bottleneck", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What stage rate binds the Cloud-to-NPU load pipeline under a 64MB ring, and is the 3-minute SLA feasible?", "visual": {"kind": "svg", "path": "mobile-1881.svg", "alt": "A linear 
pipeline diagram showing data flowing left-to-right from the Cloud through the 5G modem, the Crypto/UFS stage, and into the phone's NPU.", "caption": "Model download and local transfer pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1882", "title": "A17 NPU Deterministic Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the average queue depth and total latency per token, treating the system as an M/D/1 queue?", "visual": {"kind": "svg", "path": "mobile-1882.svg", "alt": "Line chart comparing M/M/1 vs M/D/1 wait times.", "caption": "M/M/1 vs M/D/1 delay comparison."}, "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1883", "title": "Split Compute RTT and Payload Bandwidth Match", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "With 12 T-Ops total, 40% offloaded, a 3 MB payload, and 100 ms RTT, what bandwidth matches a 40 TOPS local NPU?", "visual": {"kind": "svg", "path": "mobile-1883.svg", "alt": "Diagram showing task split between local NPU and Cloud.", "caption": "Split computation latency paths."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1884", "title": "KV Cache Memory Sizing for A17 LLM Context Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the memory pressure by computing the peak KV cache size and determine if it exceeds a strict 1 GB application limit?", "visual": {"kind": "svg", "path": "mobile-1884.svg", "alt": "Bar chart comparing 1 GB cache size against the 1 GB limit.", "caption": "KV Cache capacity limit."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1885", "title": "Snapdragon Camera Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum throughput (fps) and end-to-end latency if only the ISP and NPU can run concurrently in the pipeline architecture layout?", "visual": {"kind": "svg", "path": "mobile-1885.svg", "alt": "Gantt chart showing parallel pipeline stages bottlenecked by 10ms NPU.", "caption": "Concurrent pipeline stages."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1886", "title": "A17 Background Queue Deep State", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", 
"question": "Create an optimized scheduling limit to ensure the probability of having more than 3 tasks in the queue remains under 15%?", "visual": {"kind": "svg", "path": "mobile-1886.svg", "alt": "Line chart showing queue probabilities decaying exponentially.", "caption": "Probability of N tasks in the system."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1887", "title": "Snapdragon KV Memory Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the maximum sequence length supported under this 500 MB footprint constraint?", "visual": {"kind": "svg", "path": "mobile-1887.svg", "alt": "Bar chart comparing token context counts and their memory consumption.", "caption": "Token limit given a 500MB constraint."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1888", "title": "FedAvg Wi-Fi Star vs Ring", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization time using a Star topology (one device acts as the parameter server) versus a Ring topology?", "visual": {"kind": "svg", "path": "mobile-1888.svg", "alt": "Star topology vs Ring topology comparison.", "caption": "Star topology for Federated Averaging."}, "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 2}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1889", "title": "Vision Framework Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply pipeline engineering to determine the latency to process the 10th frame under full saturation?", "visual": {"kind": "svg", "path": "mobile-1889.svg", "alt": "Horizontal bars representing CPU, GPU, and NPU stages overlapping.", "caption": "Video frame pipeline stages."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1890", "title": "NPU-to-CPU Fallback Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply the M/M/1 infinite-queue tail approximation to estimate the fraction of arrivals routed to the CPU, and note where this approximation overestimates versus the exact M/M/1/5 blocking formula?", "visual": {"kind": "svg", "path": "mobile-1890.svg", "alt": "Line graph showing the 23.7% tail of the probability distribution above N=5.", "caption": "Probability of CPU Fallback."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1891", "title": "A17 OS Termination Grace", "topic": "fault-tolerance-checkpointing", 
"competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Can checkpointing fit in the 1.5s grace window after serialization, UFS contention, and CPU/I/O overlap?", "visual": {"kind": "svg", "path": "mobile-1891.svg", "alt": "Gantt showing 75ms save fitting easily within 1500ms grace period.", "caption": "State Save vs Grace Period."}, "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 4}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1893", "title": "Mobile AR Frame Queuing", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does the average queuing delay of an M/M/1 model meet a strict 100ms SLA for AR frames?", "visual": {"kind": "svg", "path": "mobile-1893.svg", "alt": "A hockey-stick curve plotting average delay against arrival rate, showing exponential growth as utilization nears 100%.", "caption": "M/M/1 Queuing Delay"}, "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1895", "title": "A17 Pro NPU Keyword Spotting", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the duty cycle percentage and compute the total active execution time the NPU spends over a full 1-minute period?", "visual": {"kind": "svg", "path": "mobile-1895.svg", "alt": "A timeline depicting brief 5ms high states followed by 45ms low states repeating regularly.", "caption": "NPU Duty Cycle (10%)"}, "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1896", "title": "Snapdragon Cache Hit Effective BW", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why the bar-chart arithmetic-mean reading of 92 GB/s overstates the true effective bandwidth, and compute the correct time-weighted (harmonic-mean) effective bandwidth?", "visual": {"kind": "svg", "path": "mobile-1896.svg", "alt": "Side-by-side bar chart contrasting 100 GB/s L3 cache bandwidth with 60 GB/s main memory, with stacked bars showing the 80% / 20% hit/miss split that produces the 88.2 GB/s effective bandwidth.", "caption": "Cache vs Main Memory Bandwidth weighted by 80/20 hit ratio"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1899", "title": "Mobile NLP Pipeline Bottleneck", "topic": "data-pipeline-engineering", "competency_area": "data", 
"track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the critical path bottleneck and calculate the absolute maximum throughput in tokens per second for this pipeline?", "visual": {"kind": "svg", "path": "mobile-1899.svg", "alt": "A bar chart of stage durations highlighting the 10ms NPU Transformer block as significantly taller than the rest.", "caption": "Pipeline Stage Durations"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1901", "title": "Snapdragon Always-On Energy", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply the duty cycle to calculate the total energy consumed strictly in Joules over exactly 1 hour of operation?", "visual": {"kind": "svg", "path": "mobile-1901.svg", "alt": "A duty cycle timeline showing power hovering at 2mW and spiking to 50mW for 10% of every period.", "caption": "Listening Model Energy Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1902", "title": "A17 Pro Off-Chip Spilling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an allocation strategy to minimize DRAM bandwidth waste, and compute the arithmetic intensity required to avoid bottling on RAM?", "visual": {"kind": "svg", "path": "mobile-1902.svg", "alt": "Bar chart comparing Apple A17 Pro SRAM bandwidth to LPDDR5 system memory bandwidth.", "caption": "Memory Hierarchy Bandwidth on Mobile SoC"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1903", "title": "Mobile AR Pipelining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "From the 29.4 FPS baseline, what steady-state FPS does double-buffering unlock, and which stage remains binding?", "visual": {"kind": "svg", "path": "mobile-1903.svg", "alt": "Gantt chart showing sequential CPU then NPU execution per frame.", "caption": "Sequential Processing Pipeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1904", "title": "TTS Chunk Queue Wait", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the expected wait time in the queue for a new text chunk using M/M/1 formulas?", "visual": {"kind": "svg", "path": "mobile-1904.svg", "alt": "Plot showing wait time rapidly increasing as arrival rate approaches service rate.", "caption": "Wait Time vs Arrival Rate"}, "chain_ids": ["mobile-chain-auto-secondary-014-27"], "chain_positions": {"mobile-chain-auto-secondary-014-27": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1905", "title": "5G Uplink Conversion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the bandwidth limit of the 5G connection in megabytes per second to determine if a 15 MB image can be offloaded in under 1 second?", "visual": {"kind": "svg", "path": "mobile-1905.svg", "alt": "Node diagram showing mobile device connected to cloud via a 100 Mbps link.", "caption": "Mobile Offload Link"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1907", "title": "Checkpoint Frequency vs Flash Wear in Mobile Style Transfer", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the tradeoff between checkpointing frequency and flash memory wear-out for this mobile application processing at 30 FPS?", "visual": {"kind": "svg", "path": "mobile-1907.svg", "alt": "Line graph showing cumulative GBs written over time growing extremely rapidly at 30 FPS.", "caption": "Flash Wear-Out Accumulation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1909", "title": "Mobile NPU Wake-Word Duty-Cycling Overhead", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the average power consumption including a 2ms wake-up transient overhead that draws 200mW?", "visual": {"kind": "svg", "path": "mobile-1909.svg", "alt": "Timeline showing active, sleep, and transient power spikes across multiple 20ms periods.", "caption": "Duty-Cycling Power Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1910", "title": "Mobile NPU SRAM Spilling Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the added latency bottleneck per layer transition caused by spilling and reloading this 16MB tensor?", "visual": {"kind": "svg", "path": "mobile-1910.svg", "alt": "Horizontal bar chart showing the massive bandwidth difference between internal SRAM and external LPDDR5.", "caption": "Memory Tier Bandwidth"}, "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 2}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1911", "title": "Context Switching Mobile Inference", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What architectural state must be saved to RAM to ensure the vision model can resume precisely without data corruption?", "visual": {"kind": "svg", "path": 
"mobile-1911.svg", "alt": "Timeline showing active NPU state, an interrupt spike, context save overhead, and context restore.", "caption": "Preemptive Context Switch"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1912", "title": "INT4 Quantization Bandwidth Impact", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "By what factor are memory bandwidth requirements reduced, and what is the secondary cache effect?", "visual": {"kind": "svg", "path": "mobile-1912.svg", "alt": "Bar chart comparing FP16 footprint spilling out of SRAM vs INT4 footprint fitting entirely inside.", "caption": "Cache Footprint Reduction"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1913", "title": "Federated Learning Disconnection Resilience", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain the state preservation strategy required to resume training immediately when connectivity is restored?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1914", "title": "Zero-Copy Video Pipeline on Apple Silicon", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an asynchronous iOS camera-to-Metal-to-Core ML pipeline that prevents CPU copy bottlenecks?", "visual": {"kind": "svg", "path": "mobile-1914.svg", "alt": "Block diagram showing sequential pointer handoffs instead of data copies between ISP, GPU, and NPU.", "caption": "Zero-Copy IOSurface Pipeline"}, "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 4}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1915", "title": "Tiered Sensor Hub Data Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a tiered duty-cycling system utilizing the always-on sensor hub to minimize main CPU wakeups?", "visual": {"kind": "svg", "path": "mobile-1915.svg", "alt": "Graph showing the sensor hub buffer filling slowly and dropping sharply when the main CPU wakes to drain it.", "caption": "Sensor Hub Buffer Fill and Drain"}, "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1916", "title": "Unified Memory Architecture for Apple Silicon GPU and Neural Engine Tensor Exchange", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", 
"bloom_level": "remember", "status": "published", "phase": "both", "question": "Identify the primary memory architecture used to exchange multi-megabyte tensors between the NPU and GPU without copying?", "visual": {"kind": "svg", "path": "mobile-1916.svg", "alt": "Architecture diagram showing CPU, GPU, and NPU all pointing directly to a shared unified memory block.", "caption": "Apple Unified Memory Architecture"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1917", "title": "A17 Pro NPU Activation Tiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you pipeline UNet activations to avoid system-RAM spills when tensors peak at 120MB and NPU SRAM is 32MB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1918", "title": "Snapdragon NPU Duty-Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the integrated energy-vs-latency trade-off of continuous vs batched duty-cycling policies?", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1919", "title": "A17 Pro LLM Quantization Protocol", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you test whether a Core ML-supported 4-bit path beats INT8 on latency-accuracy, while detecting off-ANE fallback?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 3}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1920", "title": "Snapdragon NPU/GPU Cache Thrashing", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate a diagnostic plan using performance counters to pinpoint SLC cache thrashing between the NPU and GPU, and propose a tiling fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1922", "title": "Android Bound Service with TFLite Native Inference for NLP Cold Starts", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a TFLite Native C++ bound-service architecture that keeps model weights warm and reduces notification inference cold-start latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "mobile-1923", "title": "Android OOM Gradient Resumption", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a mobile training checkpoint/resume strategy that survives Android OOM kills while resuming from the exact mini-batch?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 5}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1924", "title": "A17 Pro CPU-NPU Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Implement a producer-consumer queue that pipelines CPU pre-processing of frame N+1 with NPU inference of frame N, then derive the realized throughput from the per-stage timing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1925", "title": "Hardware FIFO Sensor Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a hardware wake-up FIFO strategy to buffer accelerometer data, batching inference every 5 seconds instead of waking the CPU/NPU on every step?", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1926", "title": "CoreML INT8 Calibration", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply Core ML post-training quantization with calibration while checking ANE compatibility and model-size impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1927", "title": "NPU INT4 Unpacking Penalty", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does an INT4 LLM run slower than an INT8 LLM on an NPU despite requiring half the memory bandwidth?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1929", "title": "A17 NPU Wake Break-Even for Activity Recognition", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the per-second energy accounting to compute the break-even rate for always-on vs rate-gating, and what is the best rate?", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": 
{"mobile-chain-auto-secondary-002-02": 2}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1932", "title": "Calculate model memory reduction accounting for static KV cache", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory-savings factor comes from FP16-to-INT4 weights after accounting for the static KV cache?", "chain_ids": ["mobile-chain-auto-017-06"], "chain_positions": {"mobile-chain-auto-017-06": 1}, "chain_tiers": {"mobile-chain-auto-017-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1933", "title": "Evaluate shared memory bandwidth contention during mobile task overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate how perfectly overlapping the NPU and GPU workloads will affect the AR frame latency?", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 2}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1935", "title": "Explain backpressure effects on queue arrival rates during mobile bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain what happens to the arrival rate of the queue when the buffer fills up completely during a sync burst?", "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1936", "title": "Name tightly coupled memory used for local NPU accelerator caching", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the name of the ultra-fast memory situated closest to the NPU processing elements used to avoid fetching weights repeatedly from main LPDDR memory?", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 0}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1938", "title": "Symmetric INT8 Quantization on Apple A17 Pro", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you apply symmetric INT8 quantization to this weight tensor, calculating the scale factor and the quantized integer value for a weight of 
1.25?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1939", "title": "CoreML Execution Targets on Apple Silicon", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What are the three main hardware execution targets that the CoreML framework abstracts for dynamically routing model operations?", "chain_ids": ["mobile-chain-bucket-modelser-04"], "chain_positions": {"mobile-chain-bucket-modelser-04": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1941", "title": "Super-Resolution Compute Estimation", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you calculate the required compute for 1080p at 30 FPS, and does the NPU have capacity headroom to also support a 4K mode?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1942", "title": "PhotoKit in On-Device ML Pipelines", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the primary function of the iOS PhotoKit framework when used as the data ingestion layer for an on-device ML pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1943", "title": "Federated Learning Flash Memory Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why must saving model updates to flash memory during federated training be carefully balanced against storage wear out?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 1}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1944", "title": "Split-Computing Compression Overhead", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Diagnose why total end-to-end latency increases when compressing feature maps before 5G transmission, despite reducing payload size?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1945", "title": "Diagnosing Low NPU Utilization", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the primary architectural reason why the model might achieve less than 10% of theoretical peak 
performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1946", "title": "Zero-Copy Video Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Identify the data movement bottleneck if frames are passed as UIImages, and explain how to eliminate it?", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 2}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1949", "title": "Cortex-X4 vs Hexagon TOPS/W on a 10 GOPS Workload", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Compute realized TOPS/W (GOPS/W) for each placement at the 10 GOPS demand point, identify which is more efficient, and explain the architectural source of the gap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1950", "title": "On-Device LLM Bandwidth Bound", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the theoretical upper bound for auto-regressive token generation speed assuming it is entirely memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1952", "title": "Mobile KV-Cache Constraint", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the KV-cache memory footprint for FP16 and determine how many 4096-token sessions can fit within the 2GB allocation?", "visual": {"kind": "svg", "path": "mobile-1952.svg", "alt": "Stacked bar showing 3.5GB Weights and 2.0GB KV cache", "caption": "RAM Allocation Profile"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1953", "title": "NPU Thermal Energy Bounds", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the thermal impact over a 1-hour session by computing total NPU energy consumed?", "visual": {"kind": "svg", "path": "mobile-1953.svg", "alt": "Square wave showing 5ms active peaks and 11.6ms idle valleys", "caption": "NPU Power Duty Cycle over 1 Frame"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1954", "title": "Voice Translation Queue Spike", "topic": "queueing-theory", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the root cause of massive latency lag using queueing theory, computing expected queue length 
in steady state?", "visual": {"kind": "svg", "path": "mobile-1954.svg", "alt": "Line graph showing queue length shooting up at 0.95 utilization", "caption": "Queue Length vs Utilization"}, "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1956", "title": "Agentic LLM Prefix Cache", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you optimize the KV-cache memory bandwidth usage across multiple consecutive turns?", "visual": {"kind": "svg", "path": "mobile-1956.svg", "alt": "Bar chart showing large shared prefix block and small individual generation blocks", "caption": "Prefix Caching Memory Layout"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1957", "title": "On-Device Async Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you implement an asynchronous checkpointing mechanism that bounds data loss to 10 seconds without slowing the training loop?", "visual": {"kind": "svg", "path": "mobile-1957.svg", "alt": "Timeline showing compute phase overlapping with flash write phase in a separate thread", "caption": "Asynchronous Thread Checkpointing"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1958", "title": "INT8 Context Window Doubling", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how KV-cache quantization (FP16 to INT8) impacts memory limits and maximum sequence length?", "visual": {"kind": "svg", "path": "mobile-1958.svg", "alt": "Bar chart comparing 3GB total RAM for FP16 vs 2GB total RAM for INT8", "caption": "RAM Footprint: FP16 vs INT8 KV Cache"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1959", "title": "P2P Bluetooth Sync", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How long does it take to exchange the 1MB embedding using point-to-point Bluetooth?", "visual": {"kind": "svg", "path": "mobile-1959.svg", "alt": "Two nodes connected by a bidirectional arrow labeled 2 Mbps", "caption": "Point-to-Point Bluetooth Exchange"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1961", "title": "Random Read Memory Bottleneck", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the 
effective memory bandwidth given the cache miss penalty?", "visual": {"kind": "svg", "path": "mobile-1961.svg", "alt": "Bar chart showing L2 BW, DRAM BW, and the resulting Effective BW being near DRAM", "caption": "Effective Bandwidth Degradation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1962", "title": "Federated Tree Aggregation", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you devise a hierarchical aggregation topology to handle the massive uplink traffic and reduce central server bottlenecking?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1964", "title": "On-Device Checkpoint RPO/RTO", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate a checkpointing schedule to maintain a 5-minute RPO and a 10-second RTO?", "visual": {"kind": "svg", "path": "mobile-1964.svg", "alt": "Timeline displaying training intervals, a 5-minute RPO window, a failure event, and a 10-second RTO recovery.", "caption": "RPO and RTO timeline for on-device training."}, "chain_ids": ["mobile-chain-auto-026-05"], "chain_positions": {"mobile-chain-auto-026-05": 3}, "chain_tiers": {"mobile-chain-auto-026-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1965", "title": "Mobile 7B LLM Memory Tiers", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify which memory tier acts as the primary bandwidth bottleneck during single-batch token generation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1966", "title": "Mobile KV Cache Quantization", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Does INT8 KV cache quantization improve or regress net per-token decode latency on Hexagon given the dequantization compute overhead?", "visual": {"kind": "svg", "path": "mobile-1966.svg", "alt": "Bar chart showing INT8 accommodating double the context length of FP16 under a 500MB budget.", "caption": "Context length capacity under a 500MB KV Cache budget."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1967", "title": "Voice Assistant Bursty Queue", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why do tail latency spikes occur under bursty arrivals compared to uniform arrivals despite the same average utilization?", "visual": {"kind": "svg", "path": "mobile-1967.svg", "alt": "Two queueing curves: bursty traffic shows latency 
spiking at much lower utilization than uniform traffic.", "caption": "Impact of arrival burstiness on wait time."}, "chain_ids": ["mobile-chain-auto-secondary-014-26"], "chain_positions": {"mobile-chain-auto-secondary-014-26": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1972", "title": "A17 Pro Ring AllReduce Bounds", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical lower bound communication volume sent per device for a single AllReduce pass over a 10MB payload?", "visual": {"kind": "svg", "path": "mobile-1972.svg", "alt": "Four nodes connected in a unidirectional ring.", "caption": "Ring topology for decentralized mobile communication."}, "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 0}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1977", "title": "Mesh AllGather vs Central Server", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the architectural advantage of an AllGather operation in this mesh compared to a central parameter server?", "visual": {"kind": "svg", "path": "mobile-1977.svg", "alt": "A fully connected mesh of nodes showing symmetrical links.", "caption": "Symmetrical Mesh Topology for Distributed AllGather."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1978", "title": "Hexagon NPU TCM Spilling Impact", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the performance impact of missing the NPU's TCM and spilling memory traffic to main memory?", "visual": {"kind": "svg", "path": "mobile-1978.svg", "alt": "Bar chart displaying a massive latency spike when missing TCM and hitting LPDDR5x.", "caption": "Access Latency: TCM vs LPDDR5x Spilling."}, "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 3}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1979", "title": "A17 Pro ISP to NPU Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Determine the maximum achievable frame rate if the ISP and NPU execute sequentially without pipelining?", "visual": {"kind": "svg", "path": "mobile-1979.svg", "alt": "Throughput bar chart showing 2ms ISP write followed by 3ms NPU read.", "caption": "Sequential data pipeline latency stages."}, "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": 
{"mobile-chain-auto-023-08": 0}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1980", "title": "Mobile Federated Sync Overhead", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What is the time required to sync the model update to the cloud?", "visual": {"kind": "svg", "path": "mobile-1980.svg", "alt": "Diagram showing mobile device transmitting data to cloud server.", "caption": "Mobile to Cloud uplink bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1981", "title": "Snapdragon Zero-Copy Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the copy latency and overall execution time without zero-copy optimization for a 16 MB image?", "visual": {"kind": "svg", "path": "mobile-1981.svg", "alt": "Bar chart showing a small 0.2ms copy overhead before a 2ms inference.", "caption": "Memory copy overhead in the data pipeline."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1982", "title": "A17 Pro AR Burst Processing", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "From the 300ms naive estimate, what is the realistic worst-case queue drain time after cold-cache and arrival effects?", "visual": {"kind": "svg", "path": "mobile-1982.svg", "alt": "Queue length graph starting at 15 and linearly draining to 0 over 300ms.", "caption": "AR Burst queue drain over time."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1983", "title": "Mobile FL Tree Aggregation", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Calculate the total data received by the root node in a balanced binary tree, compared to a flat star topology?", "visual": {"kind": "svg", "path": "mobile-1983.svg", "alt": "Tree structure showing root node with two children, cascading down.", "caption": "Binary tree aggregation reducing root bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1985", "title": "A17 Pro Unified Memory Limit", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "If each token in the KV cache requires 1 MB of memory, what is the maximum context length supported?", "chain_ids": ["mobile-chain-bucket-kvcachem-01"], "chain_positions": {"mobile-chain-bucket-kvcachem-01": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-01": "primary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1986", "title": "NPU TCM Memory Tiling", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify a memory tiling and scheduling strategy to process this layer locally without spilling intermediate activations to main mobile DRAM?", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 4}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1989", "title": "Zero-Copy Image Pipeline", "topic": "data-pipeline-engineering", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an evaluation metric for the memory pipeline between the ISP, GPU, and NPU, and propose a unified zero-copy buffer architecture?", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 4}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1990", "title": "Tiered Sensor Gating", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a multi-tiered duty-cycling strategy to extend battery life using the ambient light sensor and ISP motion detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1991", "title": "QAT for Text-to-Speech", "topic": "quantization-fundamentals", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an optimization plan to recover acoustic fidelity without dropping back to FP16, addressing the dynamic range issues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1993", "title": "Voice Translation Pipelining", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you implement an execution pipeline to process overlapping 40ms audio chunks concurrently, keeping per-chunk latency under 50ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1995", "title": "ARKit Priority Queuing Tail Latency", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the tail latency for Hand Tracking under (a) FIFO vs (b) priority-with-preemption queueing, including the OS preemption cost and the post-12-minute thermal throttling?", 
"validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1996", "title": "Diffusion Model Memory Bound Check", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "For one denoising step and the full 20-step inference, compute MACs and use arithmetic intensity to decide compute-bound vs memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1997", "title": "Mobile NPU versus GPU Memory Bandwidth Tradeoffs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Should the A17 Pro Neural Engine or the GPU handle KV cache updates for a 1B parameter INT8 model to maximize throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1998", "title": "Mobile Neural Processing Unit Adaptive Batching Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Model the processing pipeline as an M/D/1 queue and formulate an adaptive batching strategy to keep 99th percentile latency under 500ms when user speech rate unexpectedly doubles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-1999", "title": "SRAM Tiling versus DRAM Fetch on Mobile Chips", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy and latency trade-offs of tiling the feature maps to fit in SRAM versus relying on unified memory DRAM spills for a 1080p frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2002", "title": "Zero-Copy Memory Sharing Between Mobile CPU and NPU", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the overhead of context switching and memory movement between the CPU and NPU, and propose a shared-memory strategy using Android NNAPI?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2005", "title": "Thermally Constrained Background Data Pipelines on Mobile Cores", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you formulate a data pipeline that tokenizes SQLite text messages incrementally without causing thermal throttling or waking up the high-performance CPU cores?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
""}}, {"id": "mobile-2007", "title": "Mobile FL Checkpoint Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the trade-off between checkpointing frequency and flash storage wear-out for this interrupted mobile training job?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 4}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2013", "title": "Mobile I/O Network Throttling", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Why does waiting for upload ACKs make local A17 Pro inference stagger and drop frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2014", "title": "iOS Memory Pressure Kills", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does synchronous checkpointing of the 100MB model state during the system's memory-warning interrupt lead to corrupted or lost progress?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2016", "title": "Snapdragon Checkpoint Resume", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the resume-time overhead of JSON versus memory-mapped binary checkpoint formats. 
Identify the binding cost and quantify the resume-time difference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2017", "title": "NPU Batching Jitter", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does enforcing a fixed batch size constraint on the NPU queue exacerbate tail latency (jitter) for streaming real-time audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2018", "title": "LIFO Voice Assistant Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Design a queueing discipline that prioritizes perceived latency upon recovery, and explain its impact on the user experience?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2019", "title": "SRAM Tiling for 4K Video Upscaling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Formulate the caching strategy and calculate the minimum SRAM required to hold one horizontal tile strip allowing for a 3x3 receptive field overlap?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2020", "title": "Mobile NPU Polling Power Drain", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the power inefficiency and calculate the average power consumption of this NPU polling pattern?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2021", "title": "Mobile AR FP16 Memory Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the required memory bandwidth for the model weights alone and calculate what percentage of total system bandwidth this consumes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2022", "title": "Zero-Copy Video Pipeline Bandwidth Saving", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a zero-copy pipeline strategy and calculate the exact memory bandwidth saved per second compared to a deep copy that moves frames into an NPU-specific buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2025", "title": "A17 SLC-Resident Mixed-Precision Optimization", "topic": "quantization-fundamentals", "competency_area":
"precision", "track": "mobile", "level": "L4", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the mixed-precision allocation that fits the 26 MB SLC budget, honors per-layer precision floors, and maximizes throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2026", "title": "NPU Translation Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the realistic sustained throughput in sentences per second assuming the empirical 65% NPU utilization, and contrast against the 100%-utilization theoretical upper bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2027", "title": "Audio Buffer Downsampling Accumulation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Implement the structural downsampling logic and calculate exactly how many 1024-sample CoreAudio buffers must be accumulated to run one ML inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2028", "title": "Facial Mesh Target FLOPS Budget", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L5", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What GFLOPs/s are required for nominal INT8 and mixed-precision fallback, and does fallback still hit 120 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2029", "title": "Identifying Bandwidth Bottlenecks in Mobile Video Processing", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Identify the likely bottleneck in the data pipeline and calculate the required memory bandwidth to sustain 60 FPS for uncompressed 4K (3840x2160, 3 channels, 8-bit) input?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2030", "title": "Designing Hierarchical Wake-Up Pipelines for Mobile NPU", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycled pipeline using a low-power CPU threshold to trigger the Neural Engine, quantifying the expected power savings?", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 3}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2031", "title": "Evaluating Memory Bandwidth in Mobile INT8 Quantization", "topic": "quantization-fundamentals", 
"competency_area": "precision", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do FP16 and INT8 per-token decode latency compare on a contended 50GB/s LPDDR5 bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2032", "title": "Analyzing Mobile Thermal Throttling using D/D/1 Queues", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate how the system queue length and frame dropping behavior changes during a 1-second thermal throttling burst?", "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 1}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2033", "title": "Understanding Android NNAPI Initialization Overheads", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the NNAPI initialization steps that run on the main thread, identify which step causes the ANR, and propose the threading fix to ensure the UI remains responsive?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 1}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2034", "title": "Analyzing Thermal and Power Impacts of Mobile Batching", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does duty-cycling 100 images over 2 seconds impact the thermal envelope compared to continuously processing 1.6 images per second?", "chain_ids": ["mobile-chain-auto-secondary-002-02"], "chain_positions": {"mobile-chain-auto-secondary-002-02": 1}, "chain_tiers": {"mobile-chain-auto-secondary-002-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2035", "title": "Understanding Zero-Copy Unified Memory on Apple Silicon", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does zero-copy unified memory avoid CPU-ANE copies, and what alignment conditions cause silent fallbacks to memcpy?", "chain_ids": ["mobile-chain-auto-023-07"], "chain_positions": {"mobile-chain-auto-023-07": 0}, "chain_tiers": {"mobile-chain-auto-023-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2036", "title": "Evaluating JIT Compilation Latency Spikes on Mobile NPU", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L5", "zone": "evaluation", 
"bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the primary cause of this first-run latency spike and propose an architectural solution?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 4}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2041", "title": "Audio Pipelining to Reduce First-Word Latency", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can you apply a temporal pipelining technique to significantly reduce the user-perceived latency of the first translated word?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2043", "title": "Doubling Battery Life via VAD Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the continuous runtime, and what is the required duty cycle for a 1mW VAD to double the battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2050", "title": "A17 Pro Fine-Tuning Under iOS Background Constraints", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you create an execution architecture that performs this update without violating iOS battery and thermal background constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2053", "title": "Mobile LLM Bandwidth Bound", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum model size (in billions of parameters) to ensure a guaranteed generation speed of 15 tokens per second?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2054", "title": "Evaluate Preprocessing Offload for Thermal Headroom", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How do the trade-offs of migrating preprocessing to the ISP's scaling blocks compare to reducing the initial camera sensor resolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2056", "title": "Triple-Buffered Pipeline for Bluetooth Text Streaming", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a pipelined software architecture that hides the Bluetooth 
transmission latency and ensures seamless UI text streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2057", "title": "Memory Bandwidth Bound for Mobile Autoregression", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the minimum memory bandwidth required to read the weights during generation, and can an Apple A17 Pro support it?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2060", "title": "Hierarchical Wake-ups Using Mobile Coprocessors", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how cascading inference through the low-power motion coprocessor can heavily gate the NE and extend battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2061", "title": "Analyze WakeLock Interaction with Mobile OS Sleep", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how Android's Doze mode reacts to exact repeating WakeLocks, and how aligning inferences to OS maintenance windows prevents this drain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2063", "title": "Design Strict Memory Bound LoRA Fine-Tuning", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "What memory flow architecture permits gradient calculation for LoRA adapters without loading 4GB base weights into mutable app memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2064", "title": "Bursty AR Frame Pipeline Analysis", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does this bursty arrival pattern impact the 99th percentile frame processing latency compared to a perfectly uniform arrival rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2065", "title": "NPU Memory Tiling for Attention", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a memory tiling architecture that partitions the query, key, and value matrices into the NPU's localized SRAM to minimize LPDDR5 fetch requests?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2066", "title": "Hexagon NPU Compute Utilization", "topic": "compute-cost-estimation", 
"competency_area": "compute", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the NPU compute utilization percentage if the pipeline processes a fixed 60 frames per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2067", "title": "Hexagon Audio Batching Energy", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L3", "zone": "implement", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the energy consumed per minute by batching 5 snippets and waking up every 5 seconds, versus waking up instantly for every snippet?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2068", "title": "Image Resizing Memory Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the absolute minimum system memory read and write bandwidth consumed purely by this resizing step for a single frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2069", "title": "Federated Checkpoint Thermal Impact", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How do frequent heavy writes to non-volatile storage affect the System on Chip (SoC) thermal budget and background task execution?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 3}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2070", "title": "Neural Engine and GPU Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total end-to-end latency for one frame versus the steady-state pipeline throughput if the ANE and GPU execute asynchronously?", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 0}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2071", "title": "Pedometer Power Calculation", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption of this step-detection cycle?", "chain_ids": ["mobile-chain-auto-secondary-002-01"], "chain_positions": {"mobile-chain-auto-secondary-002-01": 0}, "chain_tiers": {"mobile-chain-auto-secondary-002-01": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2074", "title": "Estimating Maximum Frame Rate on A17 Pro 
Neural Engine", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Assuming 40% utilization of the peak TOPS, what is the maximum frame rate the NPU can sustain?", "chain_ids": ["mobile-chain-auto-secondary-004-06"], "chain_positions": {"mobile-chain-auto-secondary-004-06": 2}, "chain_tiers": {"mobile-chain-auto-secondary-004-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2075", "title": "CPU versus NPU Image Format Conversion Performance", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain the performance impact of performing the YUV-to-RGB conversion and planar transposition on the CPU versus an image signal processor (ISP) or NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2076", "title": "Optimal Checkpointing Frequency for Mobile Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Design a local checkpointing frequency that limits lost training progress to at most 20 seconds, given saving the model takes 50ms of flash write time?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 0}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Every 1 second", "Every 20 seconds", "Every 200 seconds", "Only when the user manually saves"], "correct_index": 1}}, {"id": "mobile-2077", "title": "Unified Memory Contention Between NPU and GPU Execution", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why aggressively pipelining the NPU inference and GPU rendering might cause frame drops despite sufficient theoretical compute capacity for both?", "chain_ids": ["mobile-chain-auto-secondary-016-17"], "chain_positions": {"mobile-chain-auto-secondary-016-17": 1}, "chain_tiers": {"mobile-chain-auto-secondary-016-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2078", "title": "Pipelining Model Weights Loading in Mobile Voice Translation", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L1", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how to deploy both models to minimize the 300ms total loading latency during a real-time conversation?", "chain_ids": ["mobile-chain-bucket-modelser-03"], "chain_positions": {"mobile-chain-bucket-modelser-03": 0}, "chain_tiers": {"mobile-chain-bucket-modelser-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2079", "title": "LLM KV Cache Impact on Unified Memory System Cache", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does the unified memory architecture impact the system level cache (SLC) hit rate and overall memory power during a long sequence generation?", "chain_ids": ["mobile-chain-auto-014-19"], "chain_positions": {"mobile-chain-auto-014-19": 0}, "chain_tiers": {"mobile-chain-auto-014-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2081", "title": "Pipelining Autoregressive Token Generation on Snapdragon NPU", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "mobile", "level": "L2", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum token generation rate if weight loading and compute are perfectly pipelined?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2083", "title": "Wake-up Energy Penalty in NPU Micro-power Duty Cycling", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the energy consumed to process a 10 ms batch of data from sleep, through wake-up, to completion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2084", "title": "Overlapping Compute and Network on Snapdragon", "topic": "communication-computation-overlap", "competency_area": "architecture", "track": "mobile", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How can the mobile client overlap the transmission of layer N's activations with the computation of layer N+1 or subsequent frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2087", "title": "Hardware Image Scaling for Mobile NPU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do you optimize the data pipeline to downsample the image without burdening the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2088", "title": "Compute Latency Under Thermal Burst Limits", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the expected inference time and thermal implications if the NPU can sustain 70% utilization before throttling after 2 seconds?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2089", "title": "Evaluating Pipeline Sync Barriers on Snapdragons", "topic": "communication-computation-overlap", 
"competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the pipeline's overall throughput and bottleneck if a strict memory synchronization barrier is added between every frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2090", "title": "Designing Continuous Batching for Shared NPUs", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "How would you design a serving scheduler that maximizes NPU utilization and minimizes context switching overhead for heterogeneous batch sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2091", "title": "Calculating LLM Token Rates by Memory Bandwidth", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L6+", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the maximum theoretical token generation rate for batch size 1 under shared memory constraints, assuming 80 GB/s total bandwidth and 20 GB/s for display?", "chain_ids": ["mobile-chain-auto-014-18"], "chain_positions": {"mobile-chain-auto-014-18": 3}, "chain_tiers": {"mobile-chain-auto-014-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2092", "title": "iOS Camera Pipeline Resizing for Core ML", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L5", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How should the iOS camera pipeline resize 4K frames to 224x224 while minimizing CPU copies?", "chain_ids": ["mobile-chain-auto-023-08"], "chain_positions": {"mobile-chain-auto-023-08": 3}, "chain_tiers": {"mobile-chain-auto-023-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2094", "title": "Analyzing Latency of Cold-Swapping NPU SRAM", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the latency implications of cold-swapping these models into the NPU's SRAM versus co-residing them in main memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2096", "title": "Applying iOS Jetsam Footprint Limits to LLMs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How do memory management principles explain what happens when the app footprint approaches system limits?", "chain_ids": ["mobile-chain-auto-014-15"], "chain_positions": {"mobile-chain-auto-014-15": 1}, "chain_tiers": {"mobile-chain-auto-014-15": "primary"}, "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2097", "title": "Multimodal Pipeline Interleaving", "topic": "data-pipeline-engineering", "competency_area": "optimization", "track": "mobile", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an optimized zero-copy data pipeline architecture that interleaves visual and audio ML workloads without hitting memory bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2098", "title": "Transformer INT8 Outliers", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the activation distributions to pinpoint the root cause of the accuracy degradation and propose a hardware-compatible mitigation?", "chain_ids": ["mobile-chain-auto-017-04"], "chain_positions": {"mobile-chain-auto-017-04": 2}, "chain_tiers": {"mobile-chain-auto-017-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2099", "title": "AR Throttling Queue Control", "topic": "queueing-theory", "competency_area": "latency", "track": "mobile", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a dynamic queue management algorithm to maintain real-time UI responsiveness and prevent queue explosion during thermal throttling events?", "chain_ids": ["mobile-chain-auto-secondary-014-25"], "chain_positions": {"mobile-chain-auto-secondary-014-25": 2}, "chain_tiers": {"mobile-chain-auto-secondary-014-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2100", "title": "4K Image Memory Tiling", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify the memory allocation sequence and tile sizes to process the 4K image without thrashing main memory?", "chain_ids": ["mobile-chain-auto-014-16"], "chain_positions": {"mobile-chain-auto-014-16": 1}, "chain_tiers": {"mobile-chain-auto-014-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2103", "title": "Uncompressed Frame Bandwidth", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "mobile", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the memory bandwidth required just to transport the converted RGB frames to the NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2105", "title": "NPU Audio Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "If the NPU wakes up for 200ms every 1 second, calculate the average power consumption?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2106", "title": "Symmetric vs Asymmetric MACs", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "mobile", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the impact of asymmetric vs symmetric quantization on the latency of the model's depthwise convolutional layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2108", "title": "A17 Super-Resolution Compute", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "mobile", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the theoretical minimum execution time for a single image, isolating purely computational limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2109", "title": "Checkpoint Frequency vs OOM Recovery in Federated Learning", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between checkpointing frequency and OOM recovery overhead to minimize total wasted training time?", "chain_ids": ["mobile-chain-auto-026-06"], "chain_positions": {"mobile-chain-auto-026-06": 2}, "chain_tiers": {"mobile-chain-auto-026-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2111", "title": "Always-On Gaze Detection", "topic": "duty-cycling", "competency_area": "power", "track": "mobile", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the maximum supported frame rate to keep average NPU power strictly under 5mW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2119", "title": "PagedAttention Fragmentation on Mobile", "topic": "kv-cache-management", "competency_area": "cross-cutting", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the FP16 KV cache size per page, and what is the memory fragmentation overhead during a typical 15-token generation?", "chain_ids": ["mobile-chain-bucket-kvcachem-05"], "chain_positions": {"mobile-chain-bucket-kvcachem-05": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2121", "title": "Hexagon-Adreno Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the pipeline bubble and end-to-end latency if the NPU stage takes 12ms, the GPU takes 8ms, and the synchronization barrier over shared memory takes 2ms?", "chain_ids": ["mobile-chain-auto-secondary-017-47"], "chain_positions": 
{"mobile-chain-auto-secondary-017-47": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-47": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2123", "title": "Wi-Fi Direct MAC Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the pipeline latency per token, factoring in a 4ms fixed MAC contention overhead per transmission?", "chain_ids": ["mobile-chain-auto-secondary-017-48"], "chain_positions": {"mobile-chain-auto-secondary-017-48": 0}, "chain_tiers": {"mobile-chain-auto-secondary-017-48": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2125", "title": "Wearable SPI DMA Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Determine the pipeline efficiency if the SPI DMA setup takes an additional 1ms synchronization barrier per frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2126", "title": "Video SR Memory Barrier", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the first-frame latency penalty caused by a 5ms synchronization barrier?", "chain_ids": ["mobile-chain-auto-secondary-017-47"], "chain_positions": {"mobile-chain-auto-secondary-017-47": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-47": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2127", "title": "BLE-Interval Scheduling for AR Pipelines", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a scheduling strategy to minimize the pipeline bubble given the strict network constraints?", "chain_ids": ["mobile-chain-auto-secondary-017-48"], "chain_positions": {"mobile-chain-auto-secondary-017-48": 1}, "chain_tiers": {"mobile-chain-auto-secondary-017-48": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2128", "title": "Drone Mesh Pipelining", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a micro-batch pipeline schedule that masks the 30ms mesh multi-hop synchronization delay, ensuring GPU utilization remains >80%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2130", "title": "AirDrop Half-Duplex Sync", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the total time required for the AllGather, incorporating a 20ms session negotiation latency and Wi-Fi Direct's half-duplex constraint?", "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 1}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2131", "title": "Automotive Ethernet AllGather", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What simplified bandwidth-bound latency estimates 3-node ring AllGather for 8MB per node over Gigabit Ethernet with 50us hops and a 1ms barrier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2132", "title": "5G Hierarchical Tree FL", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective bandwidth and total tree-reduce time per round, factoring in a 500ms synchronization wait for the 95th percentile straggler?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2133", "title": "LoRa Swarm Gossip", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the lower-bound time to move one 2MB update across a 3-hop LoRa mesh with 200ms CSMA/CA backoff per hop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2134", "title": "MoE AllToAll Imbalance", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the AllToAll latency for a 5MB batch of tokens, given an uneven expert load (75% tokens to node A, 25% to node B) and a 0.5ms TCP setup barrier?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2136", "title": "BLE Gossip Quantization", "topic": "collective-communication", "competency_area": "networking", "track": "mobile", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an asynchronous quantization scheme that prevents the 30ms BLE connection interval from bottlenecking the 2Hz update requirement?", "chain_ids": ["mobile-chain-auto-secondary-005-13"], "chain_positions": {"mobile-chain-auto-secondary-005-13": 3}, "chain_tiers": {"mobile-chain-auto-secondary-005-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2137", "title": "Mobile KV Cache Capacity Before Spill", "topic": "kv-cache-management", "competency_area": "architecture", 
"track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What sequence length fits in the 4GB KV-cache budget before any flash spill, given 128KB KV per token?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2138", "title": "NPU DMA Ring Buffer KV", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What memory bandwidth and DMA overhead are consumed by reading a 128MB KV cache for a 1024-token context at 10 tok/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2139", "title": "KV Cache Block Sizing for Edge Inference", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Design a block sizing strategy to minimize the 5us synchronization overhead per block fetch while keeping memory fragmentation under 10% for a 500-token chat?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2140", "title": "Wi-Fi 6 MAC Bubble Inference", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "How would you evaluate the network stall time induced by sending a 2MB KV cache update per generation step, accounting for Wi-Fi 6's 3ms MAC layer synchronization bubble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2141", "title": "Wearable BLE KV Offload", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the maximum allowable context window before the 15ms BLE connection synchronization latency and transfer time exceed the 100ms real-time audio budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2142", "title": "A17 Sliding Window Sync", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the effective memory bandwidth utilization if rolling the 128MB KV cache requires a 500us synchronization barrier between the CPU and NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2143", "title": "Orin RDMA Cache Lock", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can a distributed KV cache allocation protocol mitigate the 10us RDMA lock synchronization cost when multiple Orins append to the same sequence?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2144", "title": "5G Remote KV Prefetching", "topic": "kv-cache-management", "competency_area": "architecture", "track": "mobile", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What speculative prefetching mechanism can hide remote KV block fetch latency over 5G?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2145", "title": "Mobile mmap: Loading Models from Flash Storage", "topic": "memory-mapped-inference", "competency_area": "memory", "track": "mobile", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 2GB INT8 mmap model take 4s on first inference from UFS 3.1, and how should it be warmed?", "chain_ids": ["mobile-chain-auto-secondary-014-05"], "chain_positions": {"mobile-chain-auto-secondary-014-05": 0}, "chain_tiers": {"mobile-chain-auto-secondary-014-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2147", "title": "Model Format Conversion: Sizing the FP16 CoreML Payload", "topic": "model-format-conversion", "competency_area": "deployment", "track": "mobile", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How does the FP16 conversion mathematically impact the model's storage footprint, and what is the expected payload size of the resulting CoreML model?", "chain_ids": ["mobile-chain-auto-001-01"], "chain_positions": {"mobile-chain-auto-001-01": 1}, "chain_tiers": {"mobile-chain-auto-001-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2151", "title": "KV-Cache: Diagnose Latency Spikes in Dynamic Paged Allocation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do these severe latency spikes and OOM kills occur at allocation boundaries despite available memory, and what mobile OS dynamic drives this behavior?", "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2152", "title": "Android SoC NPU KV Cache Size Estimation", "topic": "kv-cache-management", "competency_area": "memory", "track": "mobile", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you calculate the maximum number of tokens the KV cache can hold before exhausting the remaining shared memory?", "chain_ids": ["mobile-chain-bucket-kvcachem-02"], "chain_positions": {"mobile-chain-bucket-kvcachem-02": 0}, "chain_tiers": {"mobile-chain-bucket-kvcachem-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2154", "title": "The Infotainment Traffic 
Jam", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "mobile", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the architectural root cause of these unpredictable latency spikes, and what system-level mitigations would you deploy to guarantee the SLA?", "chain_ids": ["mobile-chain-bucket-modelser-02"], "chain_positions": {"mobile-chain-bucket-modelser-02": 2}, "chain_tiers": {"mobile-chain-bucket-modelser-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2157", "title": "The Depthwise Cache Collapse", "topic": "roofline-analysis", "competency_area": "compute", "track": "mobile", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does depthwise convolution lose L1 reuse and fall into the memory-bound region of the Roofline model?", "chain_ids": ["mobile-chain-bucket-roofline-02"], "chain_positions": {"mobile-chain-bucket-roofline-02": 2}, "chain_tiers": {"mobile-chain-bucket-roofline-02": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "mobile-2161", "title": "Race-to-Sleep vs. Paced Execution for Mobile LLMs", "topic": "power-budgeting", "competency_area": "power", "track": "mobile", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which power management strategy yields a lower overall thermal load for this continuous generative workload, and how do Dynamic Voltage and Frequency Scaling (DVFS) principles justify your choice?", "chain_ids": ["mobile-chain-bucket-powerbud-01"], "chain_positions": {"mobile-chain-bucket-powerbud-01": 3}, "chain_tiers": {"mobile-chain-bucket-powerbud-01": "primary"}, "human_reviewed": {"status": "verified", "by": "vj", "date": "2026-05-02T14:47:29+00:00"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0001", "title": "The Memory Collision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the SRAM layout, how is your 'stable' tensor arena being corrupted?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0002", "title": "The RF Energy Sink", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the RF breakdown, why is BLE consuming so much more than a simple packet transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0003", "title": "Accuracy Collapse After Unstructured Magnitude Pruning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": 
"Why can aggressive unstructured pruning destroy the model's gesture decision boundaries?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0004", "title": "The CPU Cycle Thief", "topic": "real-time-deadlines", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do five blocking 64-byte reads on a 100 kHz I2C bus steal enough cycles to break a 40 ms KWS model with a 50 ms deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 3}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0005", "title": "The Arena Swiss Cheese", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the arena map, why can't the system satisfy a 30 KB request?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0006", "title": "The Jitter Storm", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can an 8 ms BLE event make a 23 ms gesture task miss a 25.6 ms hard deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0007", "title": "The Ghost Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Why do shared-crystal audio and accelerometer streams drift apart over long recordings?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0008", "title": "The Observation Gap", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a forest device listening 5 seconds per minute miss over 90% of random 1-second bird calls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0009", "title": "The SRAM Weight Corruption", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "If the audio buffer starts at 0x20000000 and SRAM model weights start at 0x20010000, how did the overflow corrupt the model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-0010", "title": "The SPI Bus Latency Choke", "topic": "extreme-quantization", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does XIP from external serial flash make a 240 MHz CPU run inference 10x slower than simulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0016", "title": "The Wildlife Camera's Power Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is its average power consumption?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~0.083 mW", "~25.0 mW", "~10.1 mW", "~0.093 mW"], "correct_index": 3}}, {"id": "tinyml-0021", "title": "The OTA Flash Memory Tax", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much flash must be reserved specifically for the OTA update partition?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~50 KB, for storing a small patch file.", "Effectively 0 KB, as you can overwrite the existing application in-place.", "~500 KB, to hold a complete second copy of the application binary.", "~32 KB, the space taken by the bootloader itself."], "correct_index": 2}}, {"id": "tinyml-0023", "title": "The Duty Cycle Power Drain", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What average power does a Cortex-M4 wake-word sensor draw when active 1 s and asleep 9 s per cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10 mW", "~5 mW", "~1 mW", "~10 µW"], "correct_index": 2}}, {"id": "tinyml-0027", "title": "The Deep Sleep Power Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of power consumption for a typical microcontroller in an active state versus a deep sleep state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100×", "~1,000×", "~10,000×", "~1,000,000×"], "correct_index": 2}}, {"id": "tinyml-0028", "title": "The Remote Wildlife Camera's Lifespan", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Given the Cortex-M4's active power consumption is 50mW and its deep sleep power is 10µW, approximately how long will the battery last?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": 
{"tinyml-chain-auto-019-06": 1}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~12 days", "~60 days", "~1,800 days", "~5,500 days"], "correct_index": 2}}, {"id": "tinyml-0037", "title": "The Wildlife Camera's Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the operational lifetime of the device and what is that lifetime?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 1.2 days", "About 20 days", "About 73 days", "About 2.5 days"], "correct_index": 2}}, {"id": "tinyml-0038", "title": "The Keyword Spotter's Battery Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power and battery life result from checking 0.5-second audio clips every 5 seconds on a 720 mWh battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~18 hours", "~36 hours", "~7.5 days", "~2.5 days"], "correct_index": 2}}, {"id": "tinyml-0040", "title": "The Birdwatcher's Power Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power comes from 1 s active at 10 mW and 59 s sleep at 10 uW on a Cortex-M4 bird sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10.0 mW", "0.59 mW", "0.177 mW", "0.010 mW"], "correct_index": 2}}, {"id": "tinyml-0041", "title": "The Solar-Powered Wildlife Cam", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the average power for a Cortex-M4 wildlife camera active for 200ms once per minute and asleep otherwise?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10.13 mW (Misinterprets µW as mW)", "~25.0 mW (Averages power without time weighting)", "~0.167 mW (Ignores sleep power in calculation)", "~0.177 mW"], "correct_index": 3}}, {"id": "tinyml-0044", "title": "The Battery-Powered Birdwatcher", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many days will an 888 mWh battery last when a Cortex-M4 wildlife camera wakes 6 times per hour for 1 s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~18 hours", "~4 days", "~397 days", "~444 days"], "correct_index": 2}}, {"id": "tinyml-0046", "title": "The Energy-Neutral Wildlife Camera", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": 
"inference", "question": "What is the maximum active time per 60-second cycle for a 1 mW solar-powered camera to remain energy-neutral?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 0}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1.20 seconds", "~58.81 seconds", "~1.19 seconds", "0.012 seconds"], "correct_index": 2}}, {"id": "tinyml-0047", "title": "The Wildlife Camera's Power Budget: Duty Cycling & Energy Harvesting", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you compute average power for a wildlife camera from its active and deep-sleep states?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.67 mW", "~9.7 mW", "~1.34 mW", "~25 mW"], "correct_index": 2}}, {"id": "tinyml-0048", "title": "The Wildlife Sensor's Power Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the wildlife sensor draw with 2 s active at 40 mW and 18 s sleep at 10 µW?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~13.0 mW", "~4.01 mW", "~4.90 mW", "~4.00 mW"], "correct_index": 1}}, {"id": "tinyml-0050", "title": "The Duty Cycle Constraint", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the correct term for the percentage of time the CPU is in an active state processing audio versus in a low-power sleep state?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Latency", "Throughput", "Duty Cycle", "Power Draw"], "correct_index": 2}}, {"id": "tinyml-0052", "title": "The Bird-Call Battery Drain", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the bird-call sensor draw with 2 s at 50 mW and 58 s at 10 µW each minute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.67 mW", "~11.33 mW", "~1.68 mW", "~100.6 mW"], "correct_index": 2}}, {"id": "tinyml-0053", "title": "The FOTA Update Risk", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should 10,000 nodes receive a 200 KB model over 250 B/s LoRaWAN without bricking on failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0054", "title": "Model Versioning on MCU", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", 
"level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How can a fleet of RP2040 sensors expose ML model versions remotely without an underlying OS or filesystem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0056", "title": "BLE Throughput for Model Update", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long and how much battery does a 150 KB BLE 5.0 model update cost on a 100 mAh wearable?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0058", "title": "The Offline Drift Detector", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you detect and handle model drift on a device with 256 KB SRAM and no internet?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0059", "title": "FOTA Update Integrity Verification", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is verifying the binary hash insufficient for ML models, and how do you implement functional model attestation (inference on a golden test input) to prove the model's math is intact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0061", "title": "Inference Result Compression for Upload", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compress daily RP2040 bird-classifier uploads to cut a 1,000-device cellular bill from about $13,000/month?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0062", "title": "Bootloader A/B Firmware Partitioning", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the flash memory layout for a 1 MB flash footprint to support A/B firmware partitioning with rollback?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0063", "title": "Fleet-Wide Model Update Strategy", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should 100,000 sensors across 5 MCU variants and BLE, LoRaWAN, and LTE-M receive a retrained model update?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0064", "title": "Hardware-in-the-Loop Testing", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many HIL boards test 5 Cortex-M variants in CI, and what wall time does each commit need?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0068", "title": "The OTA Update Brickening", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why did a 20 KB tensor-arena increase brick 10% of OTA-updated sensors despite sufficient total free memory and dual partitions?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0071", "title": "The MCU Model Extraction Attack", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you architect defense-in-depth security to protect the model on a constrained STM32F4 MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0072", "title": "Power Side-Channel Weight Extraction", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can power side-channel analysis extract model weights by correlating power traces with MAC operations, and why does the model's arithmetic structure make this ML-specific attack possible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0073", "title": "Secure Boot Chain for ML Models", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an STM32U5 TrustZone wearable authenticate both firmware and cardiac model weights at boot?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": 
{"tinyml-chain-auto-025-05": 2}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0076", "title": "The Continuous Logging Flash Death", "topic": "vram-budgeting", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How did logging 20 bytes every 5 minutes destroy the Flash?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0079", "title": "The Audio Buffer Memory Footprint", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "If you're sampling at a standard 16 kHz with a 16-bit depth, what is the approximate size of the raw audio buffer you need to allocate in SRAM just to hold one clip for processing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.2 KB", "320 bytes", "32 KB", "320 KB"], "correct_index": 2}}, {"id": "tinyml-0080", "title": "The Sensor Data Ingestion Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How much SRAM is required to buffer exactly 1 second of 16 kHz, 16-bit mono audio for TinyML wake-word inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 KB", "64 KB", "32 KB", "256 KB"], "correct_index": 2}}, {"id": "tinyml-0081", "title": "The Keyword Spotting Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you budget a 500ms keyword-spotting response deadline across audio capture, feature extraction, inference, and action?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0082", "title": "The BLE Disconnect During OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a 200 KB ML model size determine BLE OTA transfer time, and why is it riskier than firmware without incremental checksums?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0084", "title": "Watchdog Timers and Hard Real-Time Guarantees", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why must a watchdog timeout be based on worst-case ML execution time rather than average inference latency?", "chain_ids": ["tinyml-chain-auto-secondary-008-06"], "chain_positions": {"tinyml-chain-auto-secondary-008-06": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-06": 
"secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0086", "title": "The DMA Ping-Pong Desync", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a 1024-sample I2S ping-pong buffer corrupt audio when inference takes 65 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0087", "title": "Cortex-M4F Lazy FPU Stacking in ISR", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does converting ADC values to float inside a 1 kHz Cortex-M4F ISR spike RTOS latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0088", "title": "The Sensor Pipeline Without Drops", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can a 1 kHz vibration pipeline process 256-sample windows with 50 ms inference without dropping samples?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0092", "title": "The I2C Bus Lockup", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How can a long ML inference blocking the I2C ISR stall the bus, and how should layer timing set the I2C timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0094", "title": "ESP32 Wi-Fi Supply Noise Coupling Into ADC", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the ADC introduce broadband noise above 2 kHz, and how does it drop the F1-score to 0.71?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0097", "title": "Side-Channel Attacks on MCU Inference", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can an attacker extract a Cortex-M4 model utilizing Differential Power Analysis (DPA) despite RDP Level 2 flash protection?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 1}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-0098", "title": "Power Profiling for MCU Inference", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What did you miss in your datasheet-based estimate, and how do you build an accurate power profile?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0099", "title": "Power Profiling Methodology", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a measurement setup to get the real power profile, and what did the team likely miss?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0100", "title": "Watchdog Timer Integration with Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 85 ms nRF5340 inference randomly trip a 100 ms watchdog, and where should the watchdog be kicked?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 2}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0104", "title": "The UART Buffer Overrun", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a UART stream at 115200 baud corrupt JSON when a cellular modem stalls for a few seconds?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0105", "title": "Vibration-Based Predictive Maintenance", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a vibration-based predictive maintenance system for 500 industrial motors using Cortex-M4 sensors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0107", "title": "Sensor Aging Changes the Baseline — Detecting and Adapting On-Device", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you detect this is sensor drift (not real anomalies) and adapt on-device without retraining?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0108", "title": "The Hardware Crypto Engine Latency", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an hourly TLS handshake freeze the CPU and cause a 
Cortex-M4 with hardware AES to drop 20 ms audio frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0110", "title": "The Heterogeneous MCU Scheduling Problem", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "If both models must complete once per sampling cycle and may run in parallel, which i.MX RT1170 core should run the 2M-MAC vibration classifier and which should run the 180 KB-state temperature LSTM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0111", "title": "The Interrupt Latency Impact", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you calculate the effective inference time under interrupt load, and at what sensor sampling rate does the system miss the 30ms inference deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 4}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0112", "title": "The Power Supply Noise Impact", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do 50 mV regulator ripple and 30 mV inference droop degrade a 12-bit gas-sensor ADC's SNR?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0115", "title": "The Sleep Mode Wi-Fi Disconnect", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can an ESP32 in Light Sleep keep Wi-Fi powered yet still suffer a 3 s reconnect after motion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0117", "title": "Flash Erase Suspend and Motor-Control Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 300 ms SPI flash erase be safely suspended every 5 ms for a motor-control interrupt?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0118", "title": "Solar + Supercapacitor + MCU System Design", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you bridge the 300x gap between harvest rate and compute demand to run inferences using a supercapacitor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0119", 
"title": "Keyword Inference Rate on Derated 0.5 mW Solar", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Including the required 30% seasonal safety margin, how many 15 ms Ambiq Apollo4 keyword inferences per hour can a 0.5 mW solar cell sustain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0120", "title": "On-Device Vibration Anomaly Detection for Motor Bearings", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the anomaly detection pipeline, including feature extraction, model architecture, and the threshold calibration strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0121", "title": "Power-Aware Inference Scheduler", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a power-aware scheduler that meets all real-time deadlines while maximizing battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0123", "title": "The Sub-Millisecond Fault Detector", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a sub-1 ms Cortex-M4 vibration fault detector for a 10,000 RPM motor?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0124", "title": "The Solar Harvesting Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "When can you run inference within the power budget, and how do you handle cloudy days?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 3}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0125", "title": "Always-On Multi-Modal Sensor Fusion System", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the full sensor fusion architecture, specifying which sensors are always-on vs triggered, the fusion model, and the power budget?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": 
"secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0128", "title": "The Battery Life Equation", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long will a 300 mAh 3.0V coin cell power a Cortex-M4 gesture model running 30 ms once per second?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 2}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0129", "title": "The Sleep Mode Wake-Up Cost", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an nRF52840 sound classifier average 300 µA when System OFF is only 0.3 µA and inference is 25 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0130", "title": "The Energy Harvesting Wall", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the maximum inference rate you can sustain indefinitely without a battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0131", "title": "The Voltage Scaling Tightrope", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why do 3% of STM32L5 wildlife sensors misclassify after dropping from 1.2 V to 0.9 V at 26 MHz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0134", "title": "The Flash Page Erase Block", "topic": "real-time-deadlines", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 20-byte LittleFS log write to SPI flash stall for 500 ms and drop 10 audio frames?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0135", "title": "Hard-Real-Time STM32H7 Camera Inspection Schedule", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you schedule camera DMA, preprocessing, inference, and GPIO actuation within a 200 ms conveyor window to guarantee zero missed products?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0136", "title": "MCU-Based Edge AI Gateway", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", 
"question": "Can the ESP32-S3 handle this workload, and what are the critical system bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0137", "title": "The 4x Integer Speedup", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When using the Arm CMSIS-NN library, what is the theoretical throughput gain for 8-bit integer operations that can be fully parallelized, compared to a naive C implementation?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 0}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1x (no speedup without a hardware FPU)", "2x (assuming only dual-MAC instructions apply)", "4x (packing four 8-bit integers into a 32-bit register)", "32x (confusing register width with SIMD throughput)"], "correct_index": 2}}, {"id": "tinyml-0138", "title": "The Flash vs. SRAM Divide", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Where are the convolutional filter values primarily located, and where is the tensor arena for calculating activations allocated?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 0}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Both the weights and the tensor arena are loaded into SRAM.", "The weights are loaded into SRAM, and the tensor arena is allocated in Flash.", "The weights are stored in Flash, and the tensor arena is allocated in SRAM.", "Both the weights and the tensor arena are allocated in Flash memory."], "correct_index": 2}}, {"id": "tinyml-0139", "title": "The Requantization Shift", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the final arithmetic operation needed to produce the final INT8 value, q_out, before saturation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Subtract the output zero-point (`Z_out`) (Yields 60 instead of 40 in example).", "No operation is needed, just cast to INT8 (Yields 50 instead of 40).", "Add the output zero-point (`Z_out`).", "Multiply by the output zero-point (`Z_out`) (Causes overflow/clipping)."], "correct_index": 2}}, {"id": "tinyml-0140", "title": "The Depthwise Separable Cost Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What parameter-count reduction should you expect from replacing a standard 3x3 convolution with a 3x3 depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["Roughly 2-3x reduction", "Roughly 8-9x reduction", "Roughly 50-100x reduction", "No reduction in parameters, only in FLOPs"], "correct_index": 1}}, {"id": "tinyml-0141", "title": "The 1 Millisecond Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total latency budget for an interrupt-driven audio pipeline in a TinyML context?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 0}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "1 ms", "50 ns"], "correct_index": 2}}, {"id": "tinyml-0144", "title": "The Nanosecond Heist", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What operation timescale must a power-analysis attacker resolve to distinguish Cortex-M4 flash reads or individual instructions?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 0}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Milliseconds (ms)", "Microseconds (μs)", "Nanoseconds (ns)", "Picoseconds (ps)"], "correct_index": 2}}, {"id": "tinyml-0147", "title": "The SRAM Overflow Trap", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total SRAM requirement and explain why the crash is happening?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model fits. The 90 KB tensor arena is smaller than the 128 KB of available SRAM.", "The model is too large. The 250 KB model file must be loaded from Flash into the 128 KB SRAM, which is impossible.", "The device is out of SRAM. The required 138 KB (90 KB arena + 48 KB system) exceeds the 128 KB available. The model's 250 KB file size is for Flash storage, not runtime RAM.", "The device is out of Flash. The 250 KB model and 48 KB system SRAM don't leave enough space in the 512 KB of Flash for the OS."], "correct_index": 2}}, {"id": "tinyml-0149", "title": "The Flash vs. SRAM Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should you separately budget Flash for code/weights/OTA storage versus SRAM for runtime data?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 1}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it doesn't fit because SRAM (128 KB) + model weights (400 KB) = 528 KB > 256 KB.", "Yes, it fits. 
Flash usage is 946 KB and SRAM usage is 128 KB.", "No, it doesn't fit because Flash usage is 400+64+32 = 496 KB, making it impossible to add OTA.", "No, it doesn't fit because total memory is 946 + 128 = 1074 KB > 1024 KB."], "correct_index": 1}}, {"id": "tinyml-0151", "title": "The Cost of Unoptimized C Code", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How long should a 5M-MAC KWS inference take on a 168 MHz Cortex-M4 when unoptimized INT8 C costs 4 cycles per MAC?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 1}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~30 ms", "~60 ms", "~119 ms", "~238 ms"], "correct_index": 2}}, {"id": "tinyml-0152", "title": "The CMSIS-NN SIMD Dividend", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the new, optimized inference time on the same 168 MHz MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~119.0 ms (Assuming 4 cycles per MAC without SIMD)", "~59.5 ms (Assuming 2 cycles per MAC)", "~29.8 ms", "~7.4 ms (Assuming an impossible 4 MACs per cycle on INT8)"], "correct_index": 2}}, {"id": "tinyml-0153", "title": "The Real-Time MAC Budget", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the maximum MAC count for a Cortex-M4 voice model that must finish within 33 ms at 168 MHz and 1 MAC per cycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.8 Million MACs", "~5.1 Million MACs", "~5.5 Million MACs", "~168 Million MACs"], "correct_index": 2}}, {"id": "tinyml-0155", "title": "The Cortex-M7 MAC Budget", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you compare the model's computational needs to the MCU's capabilities and explain if the MCU can handle the load?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, easily. 
The MCU has 480M cycles and the model only needs 9M.", "Yes, with about 50% headroom.", "No, the required 450 MMACs/sec is too close to the MCU's 480 MMACs/sec peak.", "No, it requires 900 MMACs/sec, which is double the MCU's capability."], "correct_index": 2}}, {"id": "tinyml-0157", "title": "The Great Flash/SRAM Divide", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Where should read-only quantized weights and the read-write Tensor Arena reside on a 1MB Flash, 256KB SRAM microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weights are stored in SRAM for speed; the Tensor Arena is in Flash.", "Both the weights and the Tensor Arena are placed in SRAM.", "Weights are stored in Flash; the Tensor Arena is allocated in SRAM.", "Both the weights and the Tensor Arena are placed in Flash to save SRAM."], "correct_index": 2}}, {"id": "tinyml-0159", "title": "The Whole-Graph Arena Plan", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How do you compute the minimum tensor arena size for a two-op TFLM model with buffer reuse and runtime tail overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["75 KB", "145 KB", "85 KB", "80 KB"], "correct_index": 2}}, {"id": "tinyml-0160", "title": "The 1-Millisecond Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical total latency budget you have, from the interrupt firing to classifying the event, to meet a hard real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["16 ms", "33 ms", "1 ms", "100 µs"], "correct_index": 2}}, {"id": "tinyml-0163", "title": "The MCU Performance Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is a typical Cortex-M4 ML model more likely compute-bound or SRAM-bandwidth-bound, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is memory-bound; fetching weights from SRAM is the bottleneck.", "It is compute-bound; the processor's calculation speed is the bottleneck.", "It is I/O-bound; the SPI bus for sensor data is the bottleneck.", "The compute and memory are perfectly balanced."], "correct_index": 1}}, {"id": "tinyml-0164", "title": "Microcontroller Arithmetic Intensity", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Based on your calculation, are these devices generally compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": "", "options": ["~3.57 FLOPs/Byte (Memory-bound)", "~280 FLOPs/Byte (Compute-bound)", "~0.5 FLOPs/Byte (Compute-bound)", "~0.25 FLOPs/Byte (Memory-bound)"], "correct_index": 2}}, {"id": "tinyml-0165", "title": "The 10mW Power Budget", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What compute rate can a 10 mW solar budget sustain on a 20 GOPS/W accelerator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["200 GOPS", "2 GOPS", "0.2 GOPS (or 200 MOPS)", "0.002 GOPS (or 2 MOPS)"], "correct_index": 2}}, {"id": "tinyml-0166", "title": "The SRAM Budget Constraint", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical on-chip SRAM size you can expect to work with for the Tensor Arena?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 0}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 GB", "~2 MB", "~256 KB", "~50 mW"], "correct_index": 2}}, {"id": "tinyml-0169", "title": "The DMA Dividend", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many CPU cycles does DMA save when capturing 1 second of 16 kHz 16-bit audio versus PIO at 10 cycles per sample?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 0}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["PIO: 32,000 cycles, DMA: 200 cycles", "PIO: 16,000 cycles, DMA: 200 cycles", "PIO: 160,000 cycles, DMA: 200 cycles", "PIO: 1,600,000 cycles, DMA: 200 cycles"], "correct_index": 2}}, {"id": "tinyml-0171", "title": "The Depthwise Separable Compute Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately what is the computational savings factor you should expect when replacing a standard 3x3 convolution with a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3x", "~9x", "~64x (the number of input channels)", "Over 100x"], "correct_index": 1}}, {"id": "tinyml-0173", "title": "The 1ms Interrupt Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hard real-time latency budget should the Cortex-M4 audio DMA ISR meet to avoid missing packets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms", "~33 
ms", "~16 ms", "~1 ms"], "correct_index": 3}}, {"id": "tinyml-0174", "title": "The Dropped Audio Frame", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a Cortex-M4 process a 400 MFLOP, 1-second audio clip in real time without falling behind?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it takes ~1.19 seconds to process, which is longer than the 1-second audio clip duration.", "Yes, it only uses 84% of the processor's capacity (336 MFLOPS / 400 MFLOPs).", "No, it takes ~2.38 seconds because the 168 MHz clock speed only provides 168 MFLOPS.", "Yes, it can process two clips per second, taking about 0.59 seconds per clip."], "correct_index": 0}}, {"id": "tinyml-0175", "title": "The Flash Budget Crunch", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "Assuming sparse compression so pruned INT8 weights aren't stored densely, which listed unstructured-pruning target is the smallest that fits the model into the remaining Flash?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["0% (No pruning is needed)", "58% Sparsity", "26% Sparsity", "84% Sparsity"], "correct_index": 2}}, {"id": "tinyml-0178", "title": "The Sensor Bandwidth Chasm", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much faster is reading from on-chip SRAM compared to a standard I2C bus?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["They are roughly the same speed", "About 100x faster", "About 24,000x faster", "About 1,000x faster"], "correct_index": 2}}, {"id": "tinyml-0180", "title": "The Federated Learning Energy Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the daily energy consumption for data transmission per device for each approach?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: 50 J, Federated: 12.5 J. The federated approach is 4x more energy efficient.", "Centralized: 40 J, Federated: 100 J. The centralized approach is 2.5x more energy efficient.", "Centralized: 400 J, Federated: 100 J. The federated approach is 4x more energy efficient.", "Centralized: 40 mJ, Federated: 10 mJ. 
The difference is negligible at the fleet level."], "correct_index": 2}}, {"id": "tinyml-0181", "title": "The Microcontroller's Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate Ridge Point for a typical Cortex-M4 microcontroller, and what does this value signify for ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~336 Ops/Byte (Assuming peak MFLOPS directly maps to Ops/Byte without bandwidth division).", "~1.2 Ops/Byte (Confusing GB/s with Ridge Point directly).", "~0.28 Ops/Byte. It's heavily compute-bound.", "~168 Ops/Byte (Using clock speed instead of MFLOPS)."], "correct_index": 2}}, {"id": "tinyml-0183", "title": "The Race-to-Sleep Dilemma", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why can a higher-power Cortex-M7 consume less energy than a Cortex-M4 for a fixed 100 MFLOP keyword-spotting inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4, because its active power rating (30mW) is lower.", "The Cortex-M7, because it finishes the computation faster, spending less time in an active state.", "They are equally energy-efficient because the total number of FLOPs is the same for both.", "It's impossible to know without the sleep power consumption for each MCU."], "correct_index": 1}}, {"id": "tinyml-0184", "title": "The TinyML Memory Wall: SRAM vs. 
Flash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "On a typical microcontroller used for a TinyML application, roughly how much slower is reading the model's weights from Flash memory compared to accessing the tensor arena in on-chip SRAM?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 1}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x slower", "About 1,000x slower", "About 10-15x slower", "They are nearly the same speed"], "correct_index": 2}}, {"id": "tinyml-0185", "title": "The SRAM Tensor Arena Puzzle", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What minimum Tensor Arena size is needed for the keyword-spotting execution plan based on peak concurrent tensors?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 0}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["54 KB, the sum of all tensor sizes.", "30 KB, the size of the largest tensor.", "42 KB, the peak concurrent memory usage during the first layer's operation.", "40 KB, the peak concurrent memory usage during the second layer's operation."], "correct_index": 2}}, {"id": "tinyml-0186", "title": "The DMA Power-Saving Trade-Off", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which transfer approach minimizes energy for the 4 KB sensor sample, CPU memcpy or DMA with CPU sleep?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["CPU copy, because its total latency is lower (1.5 µJ vs 1.75 µJ calculated improperly).", "DMA, because it allows the power-hungry CPU to sleep during the transfer, saving significant energy.", "CPU copy, because the DMA setup overhead makes it inefficient for small data transfers.", "They are equivalent in power consumption because the transfer time is the same in both scenarios."], "correct_index": 1}}, {"id": "tinyml-0190", "title": "The Millisecond Machine Stop", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the typical, non-negotiable latency budget for such a hard real-time interrupt?", "chain_ids": ["tinyml-chain-auto-secondary-003-13"], "chain_positions": {"tinyml-chain-auto-secondary-003-13": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms (Typical for a cloud service response)", "33 ms (Typical for a real-time video frame on an edge device)", "16 ms (The budget to avoid UI 'jank' on a mobile 
device)", "1 ms (The budget for a hardware interrupt)"], "correct_index": 3}}, {"id": "tinyml-0192", "title": "The Energy Harvesting Deficit", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the Cortex-M7 air-quality sensor run indefinitely from a 2.0 mW solar panel while waking for 0.5 s every 10 s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, it is sustainable; sleep power is negligible compared to generation.", "No, it is not sustainable; active power of 50mW far exceeds generation.", "No, it is not sustainable; it has a net energy deficit of ~0.5mW.", "Yes, it is sustainable; it has a net energy surplus of ~1.5mW."], "correct_index": 2}}, {"id": "tinyml-0193", "title": "The Energy Cost of Privacy", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For the purpose of total device energy consumption, which of these two operations is more expensive?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Running the model locally is more expensive; ML compute is always the bottleneck.", "The energy costs are roughly equal.", "Transmitting the image is ~20x more expensive.", "Transmitting the image is over 1,000x more expensive."], "correct_index": 2}}, {"id": "tinyml-0194", "title": "The Federated Learning Battery Dividend", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the difference in annual energy consumption between the two strategies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Centralized strategy consumes approximately 11.0 Joules more per year.", "The Centralized strategy consumes approximately 109.5 Joules more per year.", "The Centralized strategy consumes approximately 98.6 Joules more per year.", "The Centralized strategy consumes approximately 0.27 Joules more per year."], "correct_index": 2}}, {"id": "tinyml-0195", "title": "The Microcontroller Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point' for a Cortex-M4, and what does its value imply about where performance bottlenecks are likely to occur?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte, meaning most models are memory-bound.", "~0.2 Ops/Byte, meaning most models are memory-bound.", "~0.2 Ops/Byte, meaning most models are compute-bound.", "~20 Ops/Byte, meaning models are well-balanced between compute and memory."], "correct_index": 2}}, {"id": "tinyml-0198", "title": "The DMA Double-Buffer Lifeline", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", 
"zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much SRAM is required for the double-buffered 16 kHz, 16-bit audio pipeline when inference takes 120 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3,840 bytes", "30,720 bytes", "7,680 bytes", "32,000 bytes"], "correct_index": 2}}, {"id": "tinyml-0199", "title": "The Quantization Energy Dividend", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Approximately how much more energy does a single 32-bit floating-point (FP32) operation consume compared to an 8-bit integer (INT8) operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Around 4x more energy.", "Around 18x more energy.", "Around 8x more energy.", "The energy savings are negligible (~1.2x)."], "correct_index": 1}}, {"id": "tinyml-0202", "title": "The Real-Time Deadline Trap", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How long after the second audio chunk arrives will its processing complete on the single-threaded Cortex-M4?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 0}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["600ms", "1000ms", "1200ms", "400ms"], "correct_index": 2}}, {"id": "tinyml-0203", "title": "The Real-Time Wakeword Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 finish a 70 MFLOP KWS inference before the next 1000 ms audio chunk arrives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it will take ~417 ms.", "No, it will take ~4.8 seconds.", "Yes, it will take ~208 ms.", "Yes, it will take only 0.208 ms."], "correct_index": 2}}, {"id": "tinyml-0204", "title": "The Solar-Powered Sensor's Inference Budget", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To remain power-neutral, what is the maximum number of inferences the station can perform per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["450 inferences per hour", "1,796 inferences per hour", "1,800 inferences per hour", "18,000 inferences per hour"], "correct_index": 1}}, {"id": "tinyml-0205", "title": "The Privacy-First Principle of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary reason 
to choose a Federated Learning approach for model updates instead of collecting all audio data in the cloud to retrain a central model?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 0}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["To achieve higher model accuracy than a centrally trained model.", "To lower the power consumption of the device during the learning process.", "To preserve user privacy by not sending raw audio data to the cloud.", "To reduce the network bandwidth costs of downloading the final, large model."], "correct_index": 2}}, {"id": "tinyml-0207", "title": "The Microcontroller's Compute Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What does the Ridge Point of a roofline model for a Cortex-M4 indicate about its performance, and what is its approximate value?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~5.0 Bytes/Op, meaning most workloads are memory-bound. (Inverted ratio trap)", "~1.6 Ops/Byte, meaning memory and compute are perfectly balanced. (Forgot 8-bit to 32-bit word size multiplier)", "~0.2 Ops/Byte, meaning most workloads are compute-bound.", "~0.025 Ops/Bit, meaning memory bandwidth is severely constrained. (Bit vs Byte confusion trap)"], "correct_index": 2}}, {"id": "tinyml-0208", "title": "TinyML Roofline: Compute or Memory Bound?", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the layer's arithmetic intensity, and is it compute-bound or memory-bound on the Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because its intensity is ~2.07 Bytes/Op.", "Memory-bound, because an Arithmetic Intensity of ~0.48 Ops/Byte is very low.", "Compute-bound, because its Arithmetic Intensity (~0.48 Ops/Byte) is higher than the Cortex-M4's ridge point (~0.28 Ops/Byte).", "Compute-bound, because all ML operations on microcontrollers are limited by CPU speed."], "correct_index": 2}}, {"id": "tinyml-0209", "title": "TinyML Tensor Arena Sizing", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To prevent memory allocation errors, what is the minimum required size for the Tensor Arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["25 KB", "49 KB", "35 KB", "37 KB"], "correct_index": 2}}, {"id": "tinyml-0210", "title": "DMA vs. 
CPU for Sensor Data", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much CPU time does a direct memcpy of the 1-second audio buffer take compared with a DMA transfer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["DMA is slower due to a 1.5ms setup overhead that exceeds the memcpy time.", "The CPU copy takes ~0.76ms, while the DMA transfer takes ~0 CPU time.", "Both take ~0.76ms because they share the same physical memory bus.", "The CPU copy takes ~7.6ms because you need 40 cycles/byte."], "correct_index": 1}}, {"id": "tinyml-0211", "title": "The Quantization Energy Cliff", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "From a pure hardware physics perspective, approximately how much more energy does a single FP32 compute operation consume compared to a single INT8 operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4×", "~3.4×", "~18×", "~100×"], "correct_index": 2}}, {"id": "tinyml-0212", "title": "The Depthwise Convolution Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What approximate compute reduction does a 3x3 depthwise separable convolution provide over a standard convolution?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 0}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About a 3x reduction", "The savings are negligible on microcontrollers", "About a 9x reduction", "It's a 2x reduction, same as using FP16 instead of FP32"], "correct_index": 2}}, {"id": "tinyml-0213", "title": "The Unforgiving Audio Buffer", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To guarantee you never lose any incoming audio, what is the absolute hard real-time deadline by which inference must complete?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1ms", "33ms", "100ms", "16ms"], "correct_index": 2}}, {"id": "tinyml-0214", "title": "The Dropped Audio Packet", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the Cortex-M4 process the 200 MFLOP audio chunk within the strict 1000 ms real-time deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, it would take over 500 seconds to process one chunk.", "No, it can only process about 1.68 chunks per second, which is too slow.", "Yes, the ideal 
estimate is about 595 ms; with 15% overhead it is about 684 ms, still below the 1000 ms deadline.", "Yes, but the 95ms of slack time is too small for a production system."], "correct_index": 2}}, {"id": "tinyml-0215", "title": "The Sleep-Wake Power Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the approximate ratio of active power consumption to deep sleep power consumption for a typical Cortex-M4 class microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10x", "~100x", ">1,000x", "They are roughly the same"], "correct_index": 2}}, {"id": "tinyml-0216", "title": "The TCO of TinyML: On-Device vs. Cloud Power", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What are the average power draws of the TinyML and cloud approaches over the 60-second cycle?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud peak power is 200 mW and TinyML is 10 mW, so it uses 20x more power.", "The TinyML device uses about 10 mW on average because the sleep power is negligible.", "Cloud uses ~16.7 mW and TinyML uses ~0.18 mW, a difference of nearly 100x.", "Both are in the low mW range; the power difference is not significant for TCO."], "correct_index": 2}}, {"id": "tinyml-0217", "title": "The Microcontroller's Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Given its hardware specifications, what is the primary performance bottleneck you would typically expect to encounter according to the Roofline Model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, as the MCU's MAC throughput is the main constraint.", "Memory-bound, as the data movement from SRAM to the CPU is the main constraint.", "Power-bound, as the MCU cannot draw enough power to run at its peak frequency.", "Flash-bound, as reading the model weights from flash storage is the bottleneck."], "correct_index": 0}}, {"id": "tinyml-0219", "title": "The SRAM Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What tensor-arena peak is required for 40 KB input, 32 KB Conv1, and 8 KB Conv2 tensors on a 256 KB MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 KB", "32 KB", "72 KB", "40 KB"], "correct_index": 2}}, {"id": "tinyml-0220", "title": "Hidden Cost of CPU memcpy", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What CPU opportunity cost does a CPU-driven memcpy impose for a 4 KB audio frame compared to using DMA?", "validated": true, "math_verified": true, "scenario": 
"", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The CPU is busy for ~24µs with memcpy, which is negligible in a 20ms budget.", "DMA is slower because it has configuration overhead.", "The CPU is busy for ~24µs with memcpy, stalling inference, while it's only busy for <1µs with DMA, enabling parallelism.", "Both methods take roughly the same time since SRAM bandwidth is the bottleneck."], "correct_index": 2}}, {"id": "tinyml-0223", "title": "The Interrupt Latency Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hard real-time latency budget should an interrupt-triggered TinyML inference target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["100 ms", "33 ms", "16 ms", "1 ms"], "correct_index": 3}}, {"id": "tinyml-0224", "title": "The Real-Time Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the system meet its hard real-time deadline of finishing one clip before the next 1-second clip arrives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~298 ms", "~303 ms", "~3.36 s", "~8 ms"], "correct_index": 1}}, {"id": "tinyml-0227", "title": "The TinyML Update Cost Fallacy", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the numbers provided, which of the following is the largest cost associated with this single, global update?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cloud compute cost (GPU hours) to train the new model.", "The aggregate cellular data transmission cost.", "The electricity cost for all one million devices to power their modems during the download.", "The initial hardware (CapEx) cost of the microcontrollers in the fleet."], "correct_index": 1}}, {"id": "tinyml-0228", "title": "The TinyML Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What does the 0.28 Ops/Byte ridge point mean on a Cortex-M4 roofline plot?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 0}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The peak theoretical performance (MFLOPS) of the microcontroller.", "The maximum memory bandwidth (GB/s) of the on-chip SRAM.", "The minimum Arithmetic Intensity (Ops/Byte) needed to be compute-bound.", "The energy cost (in pJ) of a single memory access."], "correct_index": 2}}, {"id": "tinyml-0229", "title": "The MCU Memory Wall", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", 
"bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the Cortex-M4 ridge point, and what does it imply about whether typical ML layers are compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 Ops/Byte, making it profoundly memory-bound.", "~3.57 Bytes/Op, meaning it's inefficient.", "~0.28 Ops/Byte, making it profoundly memory-bound.", "~0.28 Ops/Byte, making it typically compute-bound."], "correct_index": 3}}, {"id": "tinyml-0232", "title": "The Depthwise Separable Advantage", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of using a depthwise separable convolution instead of a standard convolution in this context?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It significantly increases model accuracy by capturing more complex features.", "It allows the model to handle variable-length audio inputs without padding.", "It dramatically reduces the number of parameters and computations (MACs).", "It's inherently more robust to noise in the audio signal."], "correct_index": 2}}, {"id": "tinyml-0234", "title": "The Real-Time Radar Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which performance metric is the most critical to optimize to meet this product requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput (inferences/sec)", "Time To First Token (TTFT)", "End-to-End Latency", "Power Consumption (mW)"], "correct_index": 2}}, {"id": "tinyml-0241", "title": "Microcontroller Performance Reality", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Based on its fundamental hardware architecture, would you expect the model's performance to be limited by its compute capability (compute-bound) or by its memory bandwidth (memory-bound)?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because all neural networks are memory-bound.", "Memory-bound, because microcontrollers have very little SRAM.", "Compute-bound, because the FPU is weak relative to the fast SRAM bandwidth.", "I/O bound, because reading from the microphone sensor is the slowest part."], "correct_index": 2}}, {"id": "tinyml-0246", "title": "The Depthwise Memory Footprint", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What parameter reduction comes from replacing a 3x3 16-to-32 standard convolution with depthwise separable convolution?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 1}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A ~32x reduction in parameters", "A ~2x reduction in parameters", "A ~7x reduction in parameters", "No significant change in parameters"], "correct_index": 2}}, {"id": "tinyml-0250", "title": "The TinyML Memory Diet: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate and explain the difference in the model's weight storage before and after INT8 quantization?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 0}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model size is reduced from 1000 KB to 250 KB, saving 750 KB.", "The model size is reduced by a factor of 8x because you go from 16-bit floats to 8-bit integers.", "The model size is reduced from 500 KB to 250 KB, saving 250 KB.", "The model size is reduced from 500 KB to 125 KB, saving 375 KB."], "correct_index": 2}}, {"id": "tinyml-0254", "title": "The SRAM Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the minimum tensor arena size required when the 40 KB input, 20 KB intermediate, and 10 KB output tensors overlap?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 1}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["70 KB", "40 KB", "60 KB", "30 KB"], "correct_index": 2}}, {"id": "tinyml-0256", "title": "The SRAM Memory Ceiling", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What typical on-chip SRAM range should you assume for a Cortex-M4 class TinyML device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["80 GB", "8 GB", "256 KB - 2 MB", "32 MB"], "correct_index": 2}}, {"id": "tinyml-0258", "title": "The Audio Buffer Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To ensure no audio data is ever lost, what is the absolute maximum processing latency your model can have to process one buffer?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 1}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["62.5 us", "640 ms", "1000 ms", "500 ms"], "correct_index": 2}}, {"id": "tinyml-0259", "title": "The Truck Roll Multiplier", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": 
"recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Over the product's 5-year lifespan, what single factor is most likely to dominate the Total Cost of Ownership (TCO)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud computing costs for aggregating data and training model updates.", "Initial hardware purchase (CapEx) of the 100,000 devices.", "Physical maintenance and battery replacement ('truck rolls').", "Energy consumption of the entire device fleet over 5 years."], "correct_index": 2}}, {"id": "tinyml-0261", "title": "The Interrupt Deadline: Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What hard real-time latency budget is typical for interrupt-driven TinyML inference?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~100 ms", "~33 ms", "~16 ms", "~1 ms"], "correct_index": 3}}, {"id": "tinyml-0262", "title": "The TinyML Compute Threshold", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point'—the minimum arithmetic intensity (Ops/Byte) required for a workload to become compute-bound on this class of device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte", "~10 Ops/Byte", "~0.5 Ops/Byte", "~50 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0263", "title": "The MCU Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate ridge point for this MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~280 Ops/Byte", "~3.57 Bytes/Op", "~0.28 Ops/Byte", "~1,342 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0266", "title": "The SRAM vs. Flash Fallacy", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate *working memory* (SRAM) you would state is actually available for the model's runtime operations like activations?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 0}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~8 MB", "~1 MB", "~256 KB", "~32 KB"], "correct_index": 2}}, {"id": "tinyml-0269", "title": "The TinyML Economics of Inference: On-Device vs. 
Cloud", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which approach is more energy-efficient for hourly keyword checks, on-device inference or cloud-assisted transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud-assisted is more efficient because it avoids running complex computation on the low-power device.", "The energy difference is negligible because sleep power dominates the total consumption over 24 hours.", "On-device is >10x more energy-efficient because the radio transmission power is far greater than the local compute power.", "They are roughly equal; the energy saved from not computing locally is offset by the energy spent on transmission."], "correct_index": 2}}, {"id": "tinyml-0273", "title": "The INT8 Energy Dividend", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much more energy-efficient is a single INT8 MAC operation compared to a single FP32 MAC operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.4x more efficient", "~4x more efficient", "~18x more efficient", "~580x more efficient"], "correct_index": 2}}, {"id": "tinyml-0274", "title": "The Depthwise Efficiency Dividend", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What MAC reduction should a 3x3 depthwise separable convolution deliver relative to a standard 3x3 convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It is about 2x cheaper.", "They are computationally equivalent.", "It is about 9x cheaper.", "It is about 9x more expensive."], "correct_index": 2}}, {"id": "tinyml-0275", "title": "The Depthwise Separable Memory Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What parameter reduction results from replacing the 3x3 16-to-32 channel convolution with a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A ~32x reduction.", "A ~9x reduction.", "A ~7x reduction.", "A ~2x reduction."], "correct_index": 2}}, {"id": "tinyml-0278", "title": "The Microcontroller's Low Ridge", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware arithmetic intensity (Ridge Point) of a Cortex-M4, where does the bottleneck lie for most neural network operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ridge point is very high (~280 
Ops/Byte), making workloads memory-bound.", "The ridge point is very low (~0.28 Ops/Byte), but this means workloads are always memory-bound.", "The ridge point is very low (~0.28 Ops/Byte), making most neural network workloads compute-bound.", "The ridge point is irrelevant for microcontrollers; only power matters."], "correct_index": 2}}, {"id": "tinyml-0281", "title": "The INT8 Energy Prize", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What per-operation energy savings should FP32-to-INT8 quantization provide for the coin-cell keyword spotting model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~4x savings", "~3.4x savings", "~18x savings", "~100x savings"], "correct_index": 2}}, {"id": "tinyml-0284", "title": "The Energy Tax of the Cloud", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which energy component dominates TCO for the always-listening smart home sensor, on-device compute or network streaming?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The on-device compute energy, because neural network FLOPs are expensive.", "The network streaming energy and on-device compute are roughly equal.", "The network streaming energy, by several orders of magnitude.", "The network streaming energy, but only by a small amount."], "correct_index": 2}}, {"id": "tinyml-0285", "title": "The Privacy Premium: On-Device vs. 
Cloud TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Assuming no other differing costs, what is the Total Cost of Ownership (TCO) difference between the two architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["$165,000 cheaper", "$795,000 cheaper", "$895,000 cheaper", "$1,095,000 cheaper"], "correct_index": 2}}, {"id": "tinyml-0286", "title": "The Microcontroller's Low Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What fundamental performance bottleneck does this extremely low value imply for most neural network workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The device is almost always memory-bound.", "The device is almost always compute-bound.", "The device is almost always power-bound.", "The device's performance is limited by its flash storage speed."], "correct_index": 1}}, {"id": "tinyml-0287", "title": "The MCU Ridge Point: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Calculate the Ridge Point for this Cortex-M4 and interpret what it means for the relationship between compute and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~3.57 Bytes/Op. It means most workloads will be memory-bound.", "~280 Ops/Byte. It means the chip requires extremely high arithmetic intensity to be compute-bound.", "~0.28 Ops/Byte. It means most ML workloads will be compute-bound.", "~1.4 Ops/Byte. 
This would be typical for a more powerful edge device, not an MCU."], "correct_index": 2}}, {"id": "tinyml-0290", "title": "The Depthwise Separable Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Why does replacing a 3x3 standard convolution with depthwise separable convolution help a Cortex-M4 CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It primarily reduces the model's parameter count and flash memory footprint.", "It significantly improves the model's prediction accuracy.", "It primarily reduces the number of computational operations (FLOPs).", "It enables the use of specialized hardware instructions on the microcontroller."], "correct_index": 2}}, {"id": "tinyml-0291", "title": "The Depthwise Memory Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How would you calculate the total number of weight parameters required for this single depthwise separable layer, assuming INT8 precision?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["432 parameters", "27 parameters", "75 parameters", "48 parameters"], "correct_index": 2}}, {"id": "tinyml-0292", "title": "The Tyranny of Sleep Current", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "To meet a 5-year battery life goal, which power metric is the most critical to minimize?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Active power during inference (mW)", "Deep sleep power consumption (µW)", "Model size on Flash (KB)", "Peak compute performance (MFLOPS)"], "correct_index": 1}}, {"id": "tinyml-0294", "title": "The Microcontroller's Compute Limit", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'ridge point' (operational intensity) for this class of device, and what does it tell you about where the bottleneck usually is?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte; most workloads are memory-bound.", "~0.2 Ops/Byte; most workloads are memory-bound.", "~0.2 Ops/Byte; most workloads are compute-bound.", "~1,300 Ops/Byte; workloads are balanced between compute and memory."], "correct_index": 2}}, {"id": "tinyml-0296", "title": "The TinyML SRAM Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What SRAM footprint should you count for the keyword-spotting deployment, and why is the Flash-stored model size irrelevant?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": 
{"tinyml-chain-auto-009-09": 0}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["272 KB. It won't fit.", "68 KB. It will fit.", "92 KB. It will fit.", "164 KB. It will fit."], "correct_index": 2}}, {"id": "tinyml-0300", "title": "The Power Budget Chasm", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of power consumed by a typical TinyML microcontroller when actively running inference versus when it is in deep sleep?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A. ~10x", "B. ~100x", "C. ~10,000x", "D. ~1,000,000x"], "correct_index": 2}}, {"id": "tinyml-0301", "title": "The Economics of Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Calculate the total daily data upload volume for this federated system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["50 GB", "500 KB", "5 GB", "5 MB"], "correct_index": 2}}, {"id": "tinyml-0302", "title": "The TinyML Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate 'Ridge Point' for a typical Cortex-M4, which tells us its operational intensity in Ops-per-Byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~336 Ops/Byte (Using MFLOPS directly, ignoring bandwidth).", "~1.2 Ops/Byte (Confusing bandwidth with the Ridge Point).", "~0.28 Ops/Byte (Correct calculation: 336e6 ops/s / 1.2e9 bytes/s).", "~168 Ops/Byte (Using MHz instead of MFLOPS)."], "correct_index": 2}}, {"id": "tinyml-0311", "title": "The MCU's Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the dense layer's arithmetic intensity, and is it compute-bound or memory-bound on the Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because the layer's Arithmetic Intensity (~1.95 Ops/Byte) is greater than the MCU's ridge point.", "Memory-bound, because all operations on resource-constrained devices like microcontrollers are limited by memory bandwidth.", "Compute-bound, because the layer's Arithmetic Intensity (~1.95 Ops/Byte) is greater than the MCU's ridge point (~0.28 Ops/Byte).", "Compute-bound, because its Arithmetic Intensity is low, which means it doesn't require much data from memory."], "correct_index": 2}}, {"id": "tinyml-0314", "title": "The Separable Convolution Cost-Cutter", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", 
"zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary advantage of this architectural change for a resource-constrained device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0319", "title": "The Microcontroller Roofline", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity of the FP32 3x3 depthwise convolution, and is it compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.2 Ops/Byte; Memory-Bound", "~1.1 Ops/Byte; Compute-Bound", "~2.2 Ops/Byte; Compute-Bound", "~0.28 Ops/Byte; Memory-Bound"], "correct_index": 2}}, {"id": "tinyml-0321", "title": "The Energy Cost of Precision: Quantization Fundamentals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Roughly how much more energy does a 16-bit floating-point (FP16) multiplication consume compared to an 8-bit integer (INT8) multiplication?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["About 2x more energy", "Roughly the same, as energy is dominated by memory access", "About 5x more energy", "About 18x more energy"], "correct_index": 2}}, {"id": "tinyml-0322", "title": "The Kilobyte Wall", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware specifications for TinyML devices, which resource constraint will you almost certainly hit first and is generally the hardest to overcome?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute (MFLOPS)", "Power Draw (mW)", "Memory (SRAM)", "Flash Storage Size"], "correct_index": 2}}, {"id": "tinyml-0323", "title": "The Flash Budget Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many INT8 parameters are saved by replacing a 3x3 16-to-32 convolution with a depthwise separable layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameters are reduced by ~32x", "The parameters are reduced by ~9x", "The parameters are reduced by ~7x", "The parameters are reduced by ~2x"], "correct_index": 2}}, {"id": "tinyml-0324", "title": "The Economic Viability of Sleep", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What deep-sleep power consumption should you expect from a modern microcontroller in a long-life TinyML device?", "validated": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~10 milliwatts (mW)", "~100 microwatts (µW)", "~10 microwatts (µW)", "~10 nanowatts (nW)"], "correct_index": 2}}, {"id": "tinyml-0326", "title": "Microcontroller's Memory Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on typical hardware specs, what is its approximate ridge point (Ops/Byte), and what does this imply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~300 Ops/Byte, implying workloads are always compute-bound.", "~10 Ops/Byte, implying a balance between compute and memory.", "~0.28 Ops/Byte, implying workloads are almost always compute-bound.", "The concept of a ridge point does not apply to microcontrollers, only GPUs."], "correct_index": 2}}, {"id": "tinyml-0328", "title": "The Tensor Arena Sizing Puzzle", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What tensor arena size follows from the peak coexisting tensors rather than summing every tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["71 KB", "40 KB", "60 KB", "50 KB"], "correct_index": 2}}, {"id": "tinyml-0331", "title": "The Depthwise Separable Memory Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How much INT8 parameter memory does a 3x3, 32-input, 64-output standard convolution require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~2.3 KB", "~8.2 KB", "~18.0 KB", "288 Bytes"], "correct_index": 2}}, {"id": "tinyml-0332", "title": "The Privacy Power Tax", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a hardware physics perspective, what is the primary cost associated with this privacy-enhancing computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Increased Flash memory required to store the computation model.", "Increased network latency to transmit the computed features.", "Increased total energy consumption due to longer active time.", "Increased peak MFLOPS demand on the microcontroller."], "correct_index": 2}}, {"id": "tinyml-0333", "title": "The TCO of a Sleeping Army", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "At $0.15 per kWh, what is the approximate total energy cost to operate this 100,000-sensor fleet for one year?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~$1,314", 
"~$14,440", "~$14.44", "< $0.01"], "correct_index": 2}}, {"id": "tinyml-0334", "title": "The Microcontroller's Memory Wall: MCU Compute Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the fundamental architectural trade-offs of this class of device, would you generally expect the model's performance to be compute-bound or memory-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Compute-bound, because the typical arithmetic intensity of neural network layers is higher than the microcontroller's low ridge point.", "Memory-bound, because the chip's memory bandwidth is very low compared to its processing speed.", "Power-bound, because the device has a strict thermal design power (TDP) of a few milliwatts.", "It depends entirely on whether the model uses depthwise or standard convolutions."], "correct_index": 0}}, {"id": "tinyml-0335", "title": "The TinyML Compute-Memory Tradeoff", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Is this convolutional layer compute-bound or memory-bound on the Cortex-M4?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 1}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound, because TinyML devices have very little SRAM, so they are always waiting on memory.", "Memory-bound, because the arithmetic intensity is ~0.1 Ops/Byte, which is lower than the ridge point.", "Compute-bound, because the layer's arithmetic intensity (~81 Ops/Byte) is much higher than the MCU's ridge point (~0.28 Ops/Byte).", "Compute-bound, because its power consumption is high, which means it is doing a lot of computation."], "correct_index": 2}}, {"id": "tinyml-0336", "title": "The TinyML Tensor Arena Squeeze", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What peak tensor arena memory is needed at Layer 5, and does it fit within 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["2.0 KB. The arena only needs to hold the single largest tensor (Layer 5 output).", "1.0 KB. Only the smallest input tensor matters.", "3.0 KB. The sum of the input (1.0 KB) and output (2.0 KB) tensors for the peak operation.", "6.0 KB. 
You must sum all tensors in the model."], "correct_index": 2}}, {"id": "tinyml-0338", "title": "The Depthwise Dividend", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the approximate ratio of FLOPs between the standard and the optimized layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Approximately 2x reduction", "Approximately 8-9x reduction", "Approximately 64x reduction (equal to the number of output channels)", "The reduction is equal to the stride of the convolution"], "correct_index": 1}}, {"id": "tinyml-0340", "title": "The Wildlife Camera's Lifespan", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the approximate battery life in days after computing the wildlife camera's duty-cycled average power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~1.1 days", "~5.6 days", "~4.8 days", "~0.6 days"], "correct_index": 2}}, {"id": "tinyml-0344", "title": "The Microcontroller's Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity (in FLOPs/Byte) required to saturate the processor's compute capability?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte", "280 Ops/Byte", "0.28 Ops/Byte", "295 Ops/Byte"], "correct_index": 2}}, {"id": "tinyml-0345", "title": "The TinyML Tensor Arena: Tensor Arena Planning", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "To minimize RAM usage, what is the absolute minimum size required for the tensor arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["30 KB", "50 KB", "100 KB", "400 KB"], "correct_index": 1}}, {"id": "tinyml-0346", "title": "The INT8 Energy Payoff", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "From a pure computational energy perspective, what is the approximate energy saving for a single operation when you move from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It consumes ~4× less energy.", "It consumes ~2× less energy.", "It consumes ~18× less energy.", "The energy consumption is roughly the same."], "correct_index": 2}}, {"id": "tinyml-0348", "title": "The Flash Memory Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": 
"inference", "question": "What parameter reduction does a 3x3 depthwise separable convolution provide for the 64-to-128 channel layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Roughly 2x. The savings are minor.", "Roughly 8.4x. It separates the spatial and cross-channel operations, drastically cutting parameters.", "Roughly 128x. It only performs the depthwise step, which is highly efficient.", "There is no reduction; it's a compute optimization, not a memory optimization."], "correct_index": 1}}, {"id": "tinyml-0349", "title": "The Flash Budget Squeeze", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on typical resource allocation in TinyML systems, which component represents the largest \"tax\" on your flash budget, directly competing with the size of your ML model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Real-Time Operating System (RTOS)", "The Bootloader", "The Over-the-Air (OTA) download partition", "The ML model's activation buffers stored in flash"], "correct_index": 2}}, {"id": "tinyml-0352", "title": "The Cortex-M4 Ridge Point", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does the Roofline ridge point signify, and what is its calculated value for the given hardware specifications?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte", "0.28 Ops/Byte", "280 Ops/Byte", "295 Ops/Byte"], "correct_index": 1}}, {"id": "tinyml-0357", "title": "The TCO of Transmission", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What recurring TCO cost is reduced most by running TinyML inference on-device instead of streaming audio?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The recurring cost of cloud inference endpoints.", "The upfront Bill of Materials (BOM) cost for a more powerful microcontroller.", "The energy cost of wireless data transmission.", "The engineering cost of developing and maintaining the cloud data ingestion pipeline."], "correct_index": 2}}, {"id": "tinyml-0358", "title": "Cortex-M4 Roofline Compute Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the typical hardware characteristics of a Cortex-M4, what is the primary performance bottleneck you are likely facing?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", 
"options": ["The system is compute-bound; the CPU's MFLOPS rating is the bottleneck.", "The system is memory-bound; data cannot be fed to the CPU fast enough.", "The system is power-bound; the model is causing the chip to thermally throttle.", "The system is compiler-bound; the toolchain isn't generating efficient instructions."], "correct_index": 0}}, {"id": "tinyml-0359", "title": "The Microcontroller's Low Bar", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What does a 0.28 Ops/Byte ridge point on Cortex-M4 mean for choosing TinyML model architectures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["3.57 Ops/Byte. Workloads must perform over 3 operations per byte to be compute-bound.", "280 Ops/Byte. Nearly all workloads will be severely memory-bound.", "0.28 Ops/Byte. Most workloads will be compute-bound as they easily exceed this low arithmetic intensity requirement.", "29.5 Ops/Byte. This is a typical ridge point, making the choice of compute- vs memory-intensive layers critical."], "correct_index": 2}}, {"id": "tinyml-0360", "title": "The Tensor Arena Budget", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the peak required tensor arena size, and will it fit within the available SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["110 KB; No, it exceeds the 106 KB of available SRAM.", "110 KB; Yes, it fits easily within the total 256 KB of SRAM.", "50 KB; Yes, the peak is determined by the largest tensor, which fits.", "90 KB; Yes, it fits with room to spare."], "correct_index": 0}}, {"id": "tinyml-0364", "title": "The Economics of Fleet Updates: Centralized vs. Federated", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which update strategy costs less in data transfer for 1M doorbells: centralized image uploads or federated gradients?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 1}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized requires ~10 GB ($0.90); Federated requires ~600 GB ($54).", "Centralized requires ~1,500 GB; Federated requires ~1,500 GB.", "Centralized requires ~1,500 GB ($135); Federated requires ~600 GB ($54). 
Federated is 2.5x cheaper.", "Federated requires ~1,500 GB; Centralized requires ~600 GB."], "correct_index": 2}}, {"id": "tinyml-0365", "title": "The Real-Time Frame Budget", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the maximum processing time per frame to guarantee zero dropped frames on a 10 FPS camera stream?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["10 ms", "33 ms", "100 ms", "1 ms"], "correct_index": 2}}, {"id": "tinyml-0367", "title": "The Energy Cost of Learning on the Edge", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "For a typical TinyML device, which single operation is the most significant contributor to its energy consumption during one federated learning cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Reading the training data from on-chip Flash memory.", "Performing the on-device model training (computation).", "Transmitting the model update to the server.", "Maintaining the device in its low-power sleep state between cycles."], "correct_index": 2}}, {"id": "tinyml-0371", "title": "The INT8 Memory Diet", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the expected reduction factor for the memory occupied by the model's weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["4x", "~3.4x", "2x", "1.1x"], "correct_index": 2}}, {"id": "tinyml-0374", "title": "The Economics of On-Device Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which weekly update strategy sends less data for 1M thermostats: 100 KB raw daily uploads or 250 KB federated weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized: 100 GB/day, Federated: 250 GB/day. Centralized is better.", "Centralized: 100 MB/day, Federated: ~36 MB/day. The difference is minor.", "Centralized: 100 GB/day, Federated: ~36 GB/day. Federated is ~3x more efficient.", "Centralized: 100 KB/day, Federated: 250 KB/day. 
Centralized is better."], "correct_index": 2}}, {"id": "tinyml-0381", "title": "The NAS Discovery on a Microcontroller", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the primary, most impactful reason the NAS prefers depthwise separable convolutions in such a constrained environment?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 0}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It has a higher FLOP-per-byte ratio, improving its arithmetic intensity.", "It consistently improves the final accuracy of the model.", "It dramatically reduces the number of parameters and computations.", "It is more resistant to quantization errors when converting to INT8."], "correct_index": 2}}, {"id": "tinyml-0382", "title": "The Economics of Awakening: A TinyML Power Budget", "topic": "tco-cost-modeling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What average power does the 10,000-sensor TinyML design draw when KWS runs every 10 s and radio transmits hourly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["~160 mW", "~0.11 mW", "~0.30 mW", "~500 mW"], "correct_index": 2}}, {"id": "tinyml-0384", "title": "The Real-Time KWS Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Based on the hardware constants, can this MCU architecture keep up with the continuous stream of audio data without falling behind?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes. The MCU takes ~238 ms per inference (80 MFLOPs / 336 MFLOPS), which is less than the 250 ms deadline from the window stride.", "Yes, easily. The MCU's inference time of ~238 ms is much shorter than the 1000 ms audio clip, leaving over 750 ms of slack.", "No. The MCU is too slow. The required processing time is 4.2 seconds (80 MFLOPs / 20 MFLOPS at 10 MHz effective throughput), which badly misses the 250 ms deadline.", "No. The system needs to process 4 windows per second (1000ms / 250ms), requiring 320 MFLOPS (4 * 80), but the MCU only runs at 168 MHz."], "correct_index": 0}}, {"id": "tinyml-0387", "title": "The Microcontroller Roofline Dilemma", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What is the layer's arithmetic intensity, and does it make the Cortex-M4 execution compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Memory-bound. The Bytes/FLOP ratio is ~0.51, indicating a memory bottleneck.", "Memory-bound. An arithmetic intensity of ~1.95 is high, so it needs a lot of data, saturating the memory bus.", "Compute-bound. 
The layer's arithmetic intensity (~1.95 FLOPs/Byte) is greater than the Cortex-M4's ridge point (~0.28 FLOPs/Byte).", "Compute-bound. 336 MFLOPS is always the bottleneck on a microcontroller, regardless of data movement."], "correct_index": 2}}, {"id": "tinyml-0388", "title": "The TinyML Tensor Arena Trap", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How should you calculate the minimum tensor arena size for the keyword-spotting model's activation tensors?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 1}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["58 KB", "28 KB", "44 KB", "40 KB"], "correct_index": 2}}, {"id": "tinyml-0390", "title": "The Depthwise Separable Memory Saver", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many parameters are in a standard 3x3 convolution with 16 input channels and 32 filters?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["288 parameters", "656 parameters", "4,608 parameters", "9,216 parameters"], "correct_index": 2}}, {"id": "tinyml-0391", "title": "The Power Cost of Privacy", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many times more energy does cloud transmission consume than on-device Cortex-M4 inference for one audio clip?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's about the same; compute costs are comparable to communication.", "2x more energy", "20x more energy", "100x more energy"], "correct_index": 2}}, {"id": "tinyml-0393", "title": "The Wake-Word Deadline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which metric must you optimize to guarantee the wake-word system meets its hard real-time response deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Throughput in inferences/second", "Batch processing efficiency", "Single-inference end-to-end latency", "Average power consumption over one minute"], "correct_index": 2}}, {"id": "tinyml-0394", "title": "The Real-Time Audio Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Can the audio pipeline run stably in real time when frames arrive every 10 ms but inference takes 45 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, because the 45ms processing time is well under the typical 500ms application deadline for a 
keyword.", "Yes, if you batch 4 frames together, you can process them all at once.", "No, because the processing time (45ms) is greater than the data arrival interval (10ms).", "No, because the total latency per frame would be 55ms (45ms + 10ms), which is too high."], "correct_index": 2}}, {"id": "tinyml-0403", "title": "The TCO of Privacy: Federated vs. Centralized Data Upload", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Which strategy is more economical for data transfer over a year, and what are the approximate costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The costs are comparable, with Federated Learning saving only a few thousand dollars (Centralized: ~$21,000 vs. Federated: ~$18,000).", "Federated Learning is vastly cheaper, costing about $18 per year compared to over $21,000 for the centralized approach.", "The centralized approach is cheaper, as the 100 KB daily model update is larger than the raw audio stream.", "The centralized approach costs about $2,600 per year, making it more expensive than Federated Learning, but still feasible."], "correct_index": 1}}, {"id": "tinyml-0406", "title": "The Hardware MAC Unit Misconception", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Are a Cortex-M4 and Cortex-M55 equally fast for ML just because both have a single-cycle hardware multiplier?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0407", "title": "The Debug Interface Profiling Trap", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the debugger add 14 ms to your inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0408", "title": "The Hardware Divider Stall", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does val / max_val cost about 40 cycles inside a Cortex-M0+ normalization loop?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 0}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0409", "title": "The 16-bit MAC Overflow", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an int16_t accumulator corrupt a 128-element INT8 dot product on a 16-bit MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0410", "title": "The MAC Budget", "topic": 
"mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can you run inference in under 100ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Yes, a 168 MHz processor does 1 MAC per cycle, completing 5M MACs in ~30 ms.", "No, standard integer execution takes 4 cycles per MAC, missing the budget at ~119 ms.", "Yes, but only by utilizing CMSIS-NN SIMD instructions (2 MACs/cycle), dropping latency to ~15 ms.", "No, even with SIMD it takes 2 cycles per MAC, taking ~60 ms."], "correct_index": 2}}, {"id": "tinyml-0411", "title": "The HVAC False Positive", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does an nRF5340 keyword model's false positive rate jump from 1% to 12% when HVAC cycles every 15 minutes?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0412", "title": "The Watchdog Reset During Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can a 280 ms RP2040 inference trip a 500 ms watchdog every few hours in the field?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 1}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0414", "title": "Bootloader A/B Partition Sizing", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should model size and runtime requirements shape the A/B OTA flash layout, and why do delta model updates help?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0415", "title": "Scalar Cortex-M4 Versus CMSIS-NN Latency", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long should a 6M-MAC DS-CNN take on a 168 MHz scalar Cortex-M4 versus CMSIS-NN on Cortex-M4F?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 2}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0416", "title": "Cortex-M55 Helium Speedup for Depthwise Conv", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the speedup for a 3x3 
depthwise convolution when migrating from Cortex-M4F to Cortex-M55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Using sustained throughput for this depthwise kernel, the M55 is about 4x faster despite the slightly lower clock: roughly 1.32 ms on M4F versus 0.35 ms on M55, or ~3.8x.", "Depthwise convolutions cannot be vectorized. The speedup is purely based on the clock speed ratio: 160/168 = 0.95x.", "The M4F wastes cycles on the odd 3x3 kernel size, while the M55 always reaches peak vector throughput. The actual speedup is ~5.1x, exceeding the sustained-throughput estimate.", "The M55 has a dedicated hardware accelerator for depthwise convolutions, resulting in a fixed 10x speedup across all kernel sizes."], "correct_index": 0}}, {"id": "tinyml-0417", "title": "Interrupt Overhead Impact on Inference", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total interrupt overhead during one 18ms inference and determine if it affects the 33ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0418", "title": "The Zero-Point Shift Wreck", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of the garbage output?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 32-bit accumulator is overflowing before the second layer runs.", "The intermediate 32-bit accumulator values are being used by the second layer without being correctly rescaled and shifted to the second layer's zero-point and scale.", "The ReLU6 activation function is not correctly implemented for INT8 inputs.", "The weights for the second layer were quantized using per-tensor instead of per-channel quantization."], "correct_index": 1}}, {"id": "tinyml-0420", "title": "The Keyword Spotting Memory Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much Flash does replacing the 128-to-128 standard convolution with a depthwise separable convolution save?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reduction is about 2x because it's a two-step process.", "It has no effect on Flash memory, only on activation size in SRAM.", "The reduction is approximately 8.4x, saving about 130 KB of Flash.", "The reduction is proportional to the kernel size, so it's a 9x reduction (3*3)."], "correct_index": 2}}, {"id": "tinyml-0421", "title": "The Inverted Residual Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is an inverted residual block likely faster than a classic residual block on the Cortex-M7 MCU?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's faster because the expansion layer allows for more parallelism on the CPU.", "It's faster by reducing SRAM data traffic, as the large intermediate tensor from the expansion layer is kept in-register.", "It's not faster; the expansion layer increases FLOPs and will make the model slower.", "It's faster because it requires fewer multiply-accumulate operations overall."], "correct_index": 1}}, {"id": "tinyml-0423", "title": "The Mixed-Precision Memory Spike", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What transient memory spike causes the mixed-precision TFLM model to fail during interpreter initialization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FP32 layer's weights are 4x larger, and the 256KB of SRAM is not enough to hold them during inference.", "The model's total activation memory now exceeds 256KB because one layer uses FP32 precision.", "The de-quantization step requires a temporary 64KB FP32 tensor to be created while the 16KB INT8 input tensor is still in memory, causing an 80KB transient spike.", "The FP32 operation causes memory fragmentation in the Tensor Arena, preventing a large enough contiguous block from being allocated."], "correct_index": 2}}, {"id": "tinyml-0424", "title": "The Silent Factory Floor", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Given INT8's range of [-128, 127], what is the most likely cause of the model's failure in the noisy factory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has overfit to the clean lab data and cannot generalize to the noisy factory environment.", "The Cortex-M4's ~336 MFLOPS is insufficient to process the audio in real-time, causing missed events.", "The high-amplitude factory noise is causing activation values to exceed the INT8 maximum of +127, leading to saturation and information loss.", "The device's microphone is physically clipping the loud audio signal before it even reaches the model."], "correct_index": 2}}, {"id": "tinyml-0427", "title": "The Vision Transformer SRAM Overflow", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the proposed Vision Transformer architecture unsuitable for the 256 KB SRAM Cortex-M7 during runtime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The attention mechanism's FLOPs are too high, making it miss the latency deadline.", "The model's weights are too large to fit in the 256KB of SRAM.", "The quadratic scaling of attention creates intermediate activation tensors that overflow the 256KB of SRAM.", "The patch embedding layer requires 270 KB for floating point operations, crashing the MCU."], "correct_index": 2}}, {"id": "tinyml-0433", "title": "The 
Sensor Fusion Skew", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What hardware-induced data issue most likely explains the 30% accuracy drop after deploying the quantized sensor fusion model?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The INT8 quantization process has likely removed crucial features from the model weights.", "The MCU memory bandwidth is insufficient to load data quickly enough, causing data corruption.", "Training-serving skew from the production sensors introducing unmodeled jitter and bias shifts the input distribution.", "The device's power management unit is throttling the clock speed, leading to calculation timeouts."], "correct_index": 2}}, {"id": "tinyml-0434", "title": "The Unstable Keyword Augmentation", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does uncontrolled noise mixing ruin the Cortex-M4 keyword start model clean accuracy and false-positive rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The MFCC feature extraction is failing to process the complex augmented audio, creating garbage input vectors.", "The model is too small; a larger model is needed to learn from a more diverse and noisy dataset.", "The Cortex-M4's limited precision (no FPU) is causing numerical underflow when processing the low-energy noise signals.", "The data augmentation created unrealistic samples with uncontrolled Signal-to-Noise Ratios (SNR), corrupting the training data quality."], "correct_index": 3}}, {"id": "tinyml-0435", "title": "The Silent Misfire", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does quiet-office PTQ calibration make a Cortex-M4 KWS model misfire in busy-street noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Weight overflow occurred during the conversion process.", "The Cortex-M4 CPU has bugs in its INT8 processing instructions.", "The model's activations are clipping because the calibration data did not capture the full dynamic range of real-world inputs.", "The model requires FP32 precision and is too complex for INT8 quantization to ever work."], "correct_index": 2}}, {"id": "tinyml-0436", "title": "The Mixed-Precision Memory Budget", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "To meet the 454KB Flash budget while maximizing accuracy, which mixed-precision quantization strategy should you apply?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": 
"", "options": ["Keep the CNN layers as FP32 and quantize the RNN layers to INT8.", "The model is too large and must be pruned or redesigned.", "Quantize the CNN layers to INT8 and convert the RNN layers to FP16.", "Quantize all layers to FP16."], "correct_index": 2}}, {"id": "tinyml-0437", "title": "The Micro-Convolution Budget", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What MAC reduction factor results from replacing the standard 3x3 64-to-128 convolution with a depthwise separable one?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 2}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It provides no computational savings, only parameter savings.", "Roughly a 2x reduction in MACs.", "Roughly an 8-9x reduction in MACs.", "Roughly a 64x reduction, proportional to the number of input channels."], "correct_index": 2}}, {"id": "tinyml-0438", "title": "The Transformer's Memory Spike", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this memory spike in the Transformer architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The sequence length N=1600 requires linear storage of ~6.4 KB, which fragments the heap.", "The intermediate N×N attention matrix requires ~10.24 MB of memory, scaling quadratically with sequence length.", "The 4x4 patch embedding creates a 160x160x16 tensor that consumes ~1.6 MB of SRAM.", "The parameter count dictates a 1.02 MB footprint, causing an off-by-one out-of-memory error."], "correct_index": 1}}, {"id": "tinyml-0439", "title": "The Power-Aware Architect", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you diagnose the flaw in this approach and propose a more physically accurate reward function?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The reward is fine; FLOPs are the main driver of power on microcontrollers.", "The reward should be `Accuracy / Parameters` to minimize flash size.", "The reward should model both compute and memory access energy costs, as data movement is a major power drain.", "The reward should be `Accuracy / Latency`, as faster models use less power."], "correct_index": 2}}, {"id": "tinyml-0442", "title": "The Privacy-Preserving Doorbell Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which training strategy has lower fleet data-transfer cost, centralized raw-audio upload or federated model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": 
["On-device training for federated learning will drain the battery too quickly, making it infeasible compared to a simple upload.", "Centralized training is cheaper because cloud GPUs are more energy-efficient than on-device CPUs, leading to lower total energy consumption.", "The cloud cost is the dominant factor; centralized training is ~3.2x more expensive due to transferring 233 TB of raw data vs. 73 TB of model updates annually.", "Centralized training is 3.2x cheaper because you only send 233 GB of data per year compared to 730 GB for federated updates."], "correct_index": 2}}, {"id": "tinyml-0444", "title": "The Saturation Misfire", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the INT8 KWS model misfire on door slams when first-layer activations clip to 127?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4's computational power is insufficient, causing skipped samples during high-energy events.", "INT8 precision is inherently too low for audio tasks; the model must be deployed in FP16 or FP32.", "The quantization calibration range is too narrow due to an unrepresentative dataset, causing activation saturation.", "The model is overfitting to the training data and requires more dropout or regularization."], "correct_index": 2}}, {"id": "tinyml-0445", "title": "The Tensor Arena Hard Fault", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you avoid the 1.12 MB FP32 temporary from a 280 KB INT8 tensor on a limited SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply aggressive weight pruning to the largest layers to reduce the overall model size.", "Quantize the final layer to INT8, likely using QAT to preserve its accuracy.", "Re-architect the model to use smaller layers at the end of the network.", "Request a hardware change to a microcontroller with at least 1.5MB of SRAM."], "correct_index": 1}}, {"id": "tinyml-0446", "title": "The SRAM Budget Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does replacing the layer with a depthwise separable convolution solve peak activation SRAM usage?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The parameter count is reduced by ~8.4x, so both memory and latency will decrease by ~8.4x, solving the problem.", "The layer is memory-bound; since parameters are stored in Flash, not SRAM, the change has no effect on the memory issue.", "The change reduces parameters and FLOPs by ~8.4x, but peak activation memory is unchanged at ~432 KiB; nevertheless, this already fits within the 2 MB SRAM.", "The FLOPs are reduced from ~339 MFLOPS to ~40 MFLOPS, but this increases latency because more, smaller operations are less efficient."], "correct_index": 2}}, {"id": "tinyml-0451", "title": "The Secure Doorbell A/B Test", "topic": "federated-learning", "competency_area": 
"cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What A/B testing strategy fits both the 200 KB and 450 KB models while preserving privacy and rollback safety?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Stream images to the cloud to run the new model server-side, allowing for rapid iteration without touching the device.", "Use a dual-partition OTA scheme, writing the new model to an inactive partition and swapping on boot for maximum safety.", "Overwrite the old model with the new one on 50% of devices; the risk of bricking is acceptable for a test rollout.", "Store both models concurrently in the available flash and use a runtime flag to switch between them, as there is sufficient space (278KB left for app logic)."], "correct_index": 3}}, {"id": "tinyml-0452", "title": "The ADC Overflow Anomaly", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do raw 12-bit I2S microphone values collapse an INT8 model calibrated on normalized [-1, 1] audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is too complex and is running out of SRAM on the device, causing memory corruption.", "The Cortex-M4 CPU doesn't have the necessary SIMD instructions to correctly handle INT8 math, leading to calculation errors.", "The quantization scale is mismatched with the raw ADC data range, causing all inputs to clip to the INT8 max value.", "The I2S microphone's clock speed is out of sync with the MCU's, causing dropped bits and corrupted input frames."], "correct_index": 2}}, {"id": "tinyml-0453", "title": "The Cafeteria False Wake", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the mechanical reason the INT8 model fails so spectacularly in a noisy environment when the identical FP32 architecture was robust?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The cafeteria noise is causing power brownouts on the device, leading to random bit-flips in the model's weights.", "The calibration dataset lacked noisy examples, causing quantization to clip noisy activations and make them indistinguishable from the keyword.", "The FP32 model was likely overfitting to the clean dataset, and the INT8 model is simply exposing this pre-existing weakness.", "The INT8 model requires more SRAM than is available, and the stack is colliding with the heap, corrupting the activation tensors during inference."], "correct_index": 1}}, {"id": "tinyml-0454", "title": "The Depthwise Separable Switcheroo", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the 
parameter reduction to determine whether replacing the 3x3 convolution with a depthwise separable convolution is a valid optimization.", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["It's a bad trade-off; the parameter savings are minimal (~10-20%).", "It reduces parameters by about 50%, which is a good starting point.", "It provides a massive parameter reduction of ~85%, making it an excellent optimization strategy.", "It will not change the parameter count, it only reduces the required computation (FLOPs)."], "correct_index": 2}}, {"id": "tinyml-0455", "title": "The Vision Transformer Memory Trap", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which person-detection model fits a 512 KB Cortex-M7: MobileNetV2 activations or a ViT attention matrix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT, because it has fewer parameters and is more modern.", "The CNN, because the ViT's quadratic N-by-N attention score matrices exceed the MCU's SRAM.", "Both will fit easily, as 512KB is plenty of memory for either model architecture.", "Neither will fit; both CNNs and ViTs require multiple megabytes of SRAM for vision tasks."], "correct_index": 1}}, {"id": "tinyml-0456", "title": "The Neural Architecture Search Power Puzzle", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate meets the 2.5 mW average power budget once duty cycle is considered?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 1}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model C, because it has the lowest active power consumption (10mW).", "Model A, because its short inference time results in the lowest average power (2.0mW).", "Model B, as it offers the best balance between active power and inference time.", "None of the models meet the budget, as their active power all exceeds 2.5mW."], "correct_index": 1}}, {"id": "tinyml-0460", "title": "The Silent Saturation", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud keyword samples collapse an INT8 Cortex-M4 model from 95% to 60% accuracy after PTQ?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's tensor arena is overflowing the SRAM due to larger intermediate buffers required for INT8.", "Critical weight precision was lost during quantization, corrupting the model's core feature extractors.", "The calibration dataset lacks sufficient dynamic range, causing activation values to overflow the INT8 range on loud inputs.", "The Cortex-M4 lacks a floating-point unit, causing emulation errors when de-quantizing intermediate results."], "correct_index": 2}}, {"id": "tinyml-0461", 
"title": "The Mixed-Precision Power Budget", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you solve this trade-off using a mixed-precision approach given the constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use the fully INT8 model but perform data augmentation to make it more robust to high-g events.", "Keep the first two layers in FP32 and quantize the remaining 90% of the model to INT8.", "Underclock the Cortex-M7 when running the full FP32 model to fit the power budget.", "Implement the full model in INT16 to get a balance of precision and performance."], "correct_index": 1}}, {"id": "tinyml-0463", "title": "The Vision Transformer Memory Explosion", "topic": "attention-scaling", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 96x96 ViT with 2x2 patches overflow 1 MB SRAM on the first attention block?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ViT model has too many parameters to fit in 1MB of Flash.", "The self-attention mechanism requires storing a quadratically-scaling attention matrix in SRAM, which overflows the memory.", "The Cortex-M7 CPU does not have hardware acceleration for the softmax operation in the attention block.", "The parameter count exceeds SRAM capacity (21.2MB)."], "correct_index": 1}}, {"id": "tinyml-0464", "title": "The Power-Aware NAS Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which NAS candidate should be selected under the 1 mW average power constraint, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Model A is the correct choice; its average power is ~0.51 mW (under budget), so we can prioritize its higher accuracy.", "Model B is the correct choice; its average power is ~0.21 mW, and you must always select the lowest power model regardless of the budget.", "Neither model works; Model A draws ~1.01 mW and Model B draws ~1.005 mW, both over the 1 mW budget.", "Both models work, but their power consumption is dominated by the 10 uW sleep state."], "correct_index": 0}}, {"id": "tinyml-0466", "title": "The Privacy-Power Trade-off", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is transmitting 64 KB raw audio or spending 25 mJ to send a 2 KB summary more energy-efficient?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Cloud Upload is more efficient; the on-device compute adds a 25mJ overhead that should be avoided.", "On-Device is 32x more efficient, since the data payload is reduced from 64KB to 2KB.", "On-Device is ~9.1x more efficient, since the total energy drops from 320mJ to 35mJ.", "Both options are 
roughly equivalent in energy cost once you factor in both compute and networking."], "correct_index": 2}}, {"id": "tinyml-0467", "title": "The Federated Thermostat A/B Test", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the monthly net economic impact of federated learning for 1M thermostats after user savings, support savings, and server cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A net benefit of ~$180,000/month, reflecting user savings minus server costs.", "A net loss, because the $20,000/month server cost is a significant new expense.", "A net benefit of ~$230,000/month, from both user savings and reduced support tickets.", "A net benefit of ~$30,000/month, reflecting only the support ticket savings minus server costs."], "correct_index": 2}}, {"id": "tinyml-0468", "title": "The Noisy Kitchen Problem", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What most likely causes the INT8 keyword-spotting model to fail in the noisy kitchen despite strong lab accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model is overfit to the clean training data and cannot generalize to noisy environments.", "The Cortex-M4's computational power is insufficient for running the CNN on noisy audio, causing missed inferences.", "The model's INT8 quantization range, calibrated on clean audio, is too narrow for the noisy kitchen environment, causing input values to saturate.", "The increased noise in the audio signal causes larger activation tensors, leading to an SRAM memory overflow in the tensor arena."], "correct_index": 2}}, {"id": "tinyml-0470", "title": "The Battery-Powered A/B Test", "topic": "ota-firmware-updates", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which saves more battery for 1M smart doorbells: a 500 KB OTA model download over WiFi, or a 20 KB federated update followed by 15 s of on-device training at 100% CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["OTA Update; the 15 seconds of 100% CPU utilization for on-device training is the biggest power drain.", "Federated Fine-Tuning; it is more energy-efficient because the energy saved by the shorter radio time for the small upload far outweighs the energy spent on local computation.", "They are roughly equivalent; the total active time for both tasks is similar (~16 seconds), so the battery impact will be negligible.", "OTA Update; transferring 500 KB of verified firmware from the cloud is inherently more secure and reliable than running on-device training, justifying the energy cost."], "correct_index": 1}}, {"id": "tinyml-0471", "title": "The Real-Time Interrupt Stall", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What scheduling fault causes the vibration 
analysis to miss its 1 ms deadline despite sufficient MCU compute?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The FFT model (0.3 MFLOPs) is too computationally expensive for the Cortex-M4 MCU to meet the 1ms deadline.", "The 2MB of available SRAM is insufficient to hold both the FFT model's tensors and the temperature task's state.", "The low-priority temperature check is preempting the high-priority vibration analysis, causing it to miss its deadline due to the combined execution time.", "The 168 MHz clock speed of the MCU is the bottleneck; upgrading to a 480 MHz Cortex-M7 would solve the problem."], "correct_index": 2}}, {"id": "tinyml-0473", "title": "The Silent Factory Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a Cortex-M4 INT8 KWS model clamp factory-floor activations to 127 after clean-speech PTQ calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has a memory leak in the convolution operator, causing a buffer overflow.", "The Cortex-M4 CPU does not have enough computing power (MFLOPS) to execute the model in real-time.", "The calibration dataset used for PTQ did not represent the dynamic range of real-world audio inputs, causing activation overflow.", "The model should have been quantized to FP16 instead of INT8, as FP16 has higher precision."], "correct_index": 2}}, {"id": "tinyml-0474", "title": "The Doorbell Latency Crisis", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which architectural change most effectively solves this compute bottleneck based on quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Prune 50% of the filters in the standard convolution.", "Apply INT8 post-training quantization.", "Replace the standard convolution with a depthwise separable convolution.", "Replace the CNN layer with a small MobileViT-style attention block."], "correct_index": 2}}, {"id": "tinyml-0475", "title": "The Overwhelmed Sensor Fusion MCU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does 11 ms of CPU work per 10 ms sensor window make the Cortex-M7 miss a 2 ms shutdown deadline?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 0}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M7's instruction cache is thrashing due to context switching between the audio and accelerometer tasks.", "The model is too large, causing an SRAM overflow which leads to system resets and missed data.", "The total required processing time (11ms) exceeds the data arrival interval (10ms), creating an unstable queue and growing latency.", "The SPI bus connecting the sensors to the MCU is saturated and cannot 
deliver the data fast enough."], "correct_index": 2}}, {"id": "tinyml-0476", "title": "The Desert Drone Reboot", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do Cortex-M7 delivery drones reboot when desert landing vision pushes a 150 ms pipeline past a 200 ms watchdog?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The desert heat is causing the MCU to thermally throttle, slowing down execution until it misses the watchdog deadline.", "The vision pipeline's latency has increased due to data drift from the new desert environment, exceeding the 200ms watchdog budget.", "A memory leak in the preprocessing code is causing `malloc` to occasionally take longer than 200ms, triggering the watchdog.", "The drone's power management system is unstable during landing, causing a voltage drop that resets the MCU."], "correct_index": 1}}, {"id": "tinyml-0477", "title": "The Silent Saturator", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What quantization failure mode explains loud-field input failures after calibration on quiet office audio?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 1}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4's DSP instructions have a bug with INT8 convolutions that manifests under high-magnitude inputs.", "General precision loss from the FP32-to-INT8 conversion is too severe, making the model inherently unstable.", "The calibration dataset was not representative of real-world inputs, causing activation overflow for loud signals.", "Weight overflow occurred during the initial quantization of the model's parameters, corrupting a key layer."], "correct_index": 2}}, {"id": "tinyml-0479", "title": "The PCIe-Powered Sensor", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which statement best identifies the primary bottleneck and core flaw of adding a PCIe accelerator to a TinyML sensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The PCIe Transaction Layer Packet (TLP) overhead would add unacceptable latency compared to the direct memory access nature of the microcontroller's SPI bus.", "The microcontroller's GPIO pins cannot be physically configured to support the SerDes lanes required for the PCIe protocol, making it a driver and hardware compatibility issue.", "The power consumption of the PCIe interface is orders of magnitude too high for the device's battery-powered budget, and its bandwidth is unnecessary for a single audio stream.", "The main problem is that NVLink would be a better choice than PCIe for connecting the accelerator, as its higher bandwidth and GPU-centric design are better suited for ML workloads."], "correct_index": 2}}, {"id": "tinyml-0480", "title": "The 
Real-Time Queueing Cascade", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which event misses the 35 ms deadline when 20 ms vibration inferences arrive at 0, 10, and 20 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["All events will be processed on time, since the 20ms processing time is well under the 35ms deadline.", "The system will crash due to an SRAM overflow from having to buffer three events simultaneously.", "The third event misses its deadline because the queueing delay from the first two events pushes its completion time past its deadline.", "A real-time operating system (RTOS) would use preemption to pause the earlier events, ensuring all three deadlines are met."], "correct_index": 2}}, {"id": "tinyml-0481", "title": "The Silent Mic Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud clips collapse an INT8 KWS model to under 10% accuracy after quiet-audio calibration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The activation memory required by the loud inputs is overflowing the device's 256KB SRAM, causing data corruption.", "The Cortex-M4's instruction set is failing to correctly execute the quantized operations, leading to arithmetic errors on loud inputs.", "The calibration dataset was not representative of real-world audio levels, causing activation values to clip at the INT8 maximum of 127.", "The model architecture is numerically unstable, and the smaller bit-width of INT8 magnifies pre-existing training issues."], "correct_index": 2}}, {"id": "tinyml-0482", "title": "The TinyML Transformer Trap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does a 5x smaller Tiny-ViT OOM on a 256 KB Cortex-M4 when the 150 KB CNN fits perfectly?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4 is too weak. Propose upgrading the hardware to a mobile-class chip that has enough RAM to support a Transformer architecture.", "The model conversion process is buggy. With 5x fewer parameters, the Transformer should be much smaller. Focus on debugging the TFLite conversion script.", "The model needs more aggressive quantization. Apply post-training INT4 or INT2 quantization to the Transformer to reduce its memory footprint until it fits.", "The Transformer O(N^2) self-attention matrix is too large for the SRAM. 
Replace the CNN standard convolutions with depthwise separable convolutions to improve efficiency and accuracy within the memory budget."], "correct_index": 3}}, {"id": "tinyml-0486", "title": "The TinyML Latency Crisis", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is replacing the bottleneck convolution with a depthwise separable convolution sufficient to meet the 100 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["No, the bottleneck on a Cortex-M4 is always memory bandwidth (low Ops/Byte ratio), so reducing FLOPs will not significantly impact latency.", "No, the reduction is only about 2x, bringing latency down to ~75ms, but adding the pointwise layer overhead will push it back over 100ms.", "Yes, the change reduces computation by ~8.4x, bringing the new total latency to ~41ms, which is within the 100ms budget.", "Yes, it will meet the deadline because parameter count is reduced by ~8.4x, which translates directly to a latency reduction of the same factor."], "correct_index": 2}}, {"id": "tinyml-0500", "title": "The Silent Activation Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely cause of this catastrophic accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The weight quantization process introduced too much error, shifting the model's decision boundary.", "The microcontroller's specific instruction set for INT8 convolutions has a bug, producing incorrect results.", "The calibration dataset was not representative of real-world audio, causing activation values to overflow the narrow INT8 dynamic range.", "The device has insufficient SRAM, causing the tensor arena memory to be corrupted during inference."], "correct_index": 2}}, {"id": "tinyml-0503", "title": "The Predictive Maintenance TCO Dilemma: Federated Learning", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Should Model B be deployed after accounting for failure savings and the added sensor energy cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Do not deploy. The new model adds over $300,000 in annual energy costs, making it too expensive.", "Deploy. The TCO decreases by approximately $200,000 annually due to the reduction in false positives.", "Deploy. The TCO decreases by approximately $4.2 million annually, as failure cost savings vastly outweigh the minimal increase in power cost.", "Do not deploy. 
The marginal improvement in accuracy does not justify the complexity of a federated learning deployment."], "correct_index": 2}}, {"id": "tinyml-0505", "title": "The Smart Doorbell's Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural replacement best addresses both latency and memory problems in the smart doorbell convolution layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply INT8 post-training quantization to the layer. It will cut the memory footprint by 4x and is simple to implement.", "Replace the layer with a small Vision Transformer block, as they are state-of-the-art for vision tasks.", "Replace the standard convolution with a 3x3 depthwise separable convolution to reduce computation by ~8.4x.", "Use unstructured weight pruning to remove 85% of the connections, as this will create a sparse and efficient layer."], "correct_index": 2}}, {"id": "tinyml-0508", "title": "The Keyword Spotting Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which path should you choose for the Cortex-M4 keyword spotting model, micro-ViT or NAS with depthwise separable CNNs?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has too many parameters. Using depthwise separable convolutions reduces the parameter count, which will solve our problem by making the model smaller for the flash.", "The model is too slow. The ViT architecture is more modern and can be heavily quantized to INT4 to fit within the memory and latency budget.", "The bottleneck is peak SRAM usage (Tensor Arena). The ViT is infeasible due to its quadratic attention complexity. We must switch to depthwise separable convolutions to reduce computation by ~8x and the resulting activation memory footprint.", "The SRAM usage is too high. We should implement a Mixture-of-Experts (MoE) layer to ensure only a fraction of the model is executed per inference, which is the standard way to scale down large models."], "correct_index": 2}}, {"id": "tinyml-0511", "title": "The TinyML Keyword Spotting Dilemma", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L2", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which candidate architecture should you choose under the 256 KB tensor arena and 100 ms latency constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0515", "title": "The Flash Memory Diet", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is replacing the standard convolution with a depthwise separable convolution better than slashing output channels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The junior engineer is right. 
Slashing output channels from 128 to 16 is the simplest and most direct way to achieve the required ~8x size reduction.", "Use a smaller kernel, like 1x1, instead of 3x3. This reduces parameters without changing the channel depth.", "Replace the standard convolution with a depthwise separable convolution. It gets a similar layer-size reduction while using a pointwise projection to keep a 128-channel output, retaining more accuracy.", "Apply 4-bit integer quantization to the existing layer. This will reduce the model size by 4x, which is the most significant saving possible."], "correct_index": 2}}, {"id": "tinyml-0525", "title": "The Federated Wake-Word TCO", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which yearly energy TCO is lower for improving the wake-word model, centralized audio upload or federated on-device training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The federated approach has a higher TCO because the 10-second training time is 10x longer than the 1-second upload time, leading to higher energy consumption.", "The centralized approach is overwhelmingly more expensive due to the massive cloud ingress and storage costs from 1 million devices, making the on-device energy cost irrelevant.", "The centralized approach has a higher energy TCO because the Wi-Fi radio's power draw (1W) is 20x higher than the MCU's (50mW), making its total energy per event 2x greater despite the shorter duration.", "The energy TCO difference is negligible as it amounts to fractions of a cent per device, so the decision should be based purely on privacy and implementation complexity, not economics."], "correct_index": 2}}, {"id": "tinyml-0526", "title": "The Silent Wake-Word Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the INT8 Hey Lumi model fail in car noise after PTQ calibration on quiet audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The calibration dataset did not include noisy, in-car audio, causing activation values to exceed the calibrated range and 'clip' during quantization.", "The Cortex-M4 microcontroller lacks the necessary DSP instructions to perform INT8 convolutions efficiently, corrupting the output.", "The model architecture uses a SiLU (Swish) activation function, which is non-saturating and is known to cause numerical instability during INT8 conversion.", "The weight values were corrupted during the FP32-to-INT8 conversion. 
The model must be retrained from scratch using Quantization-Aware Training (QAT)."], "correct_index": 0}}, {"id": "tinyml-0530", "title": "The Silent Overflow Catastrophe", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does clean-audio PTQ calibration make noisy +8.0 activations collapse a Cortex-M4 KWS model to random accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's architecture uses activation functions that are fundamentally incompatible with 8-bit integers.", "Weight quantization error was too high, destroying the model's learned representations across all layers.", "The calibration dataset was not representative, causing activation value clipping during inference on real-world data.", "The Cortex-M4's DSP instructions have a bug in the INT8 convolution kernel, leading to incorrect matrix multiplications."], "correct_index": 2}}, {"id": "tinyml-0533", "title": "The Factory Floor Cascade Failure", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is the 80 MFLOP vibration model viable under the 50 ms hard real-time deadline on the Cortex-M7?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The event queue size is too small to handle the burst. Increasing the queue size would solve the problem.", "The Cortex-M7 is thermally throttling under the burst load, reducing its effective FLOPS.", "The per-event processing time (83.3ms) exceeds the real-time deadline (50ms), making the system unstable even for a single event.", "The average arrival rate (1 Hz) is much lower than the service rate (~12 Hz), so the issue is a software bug in the event handler, not a performance bottleneck."], "correct_index": 2}}, {"id": "tinyml-0534", "title": "The Silent Doorbell Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why do loud close-range doorbell activations clamp to -128 in a 256 KB Flash INT8 keyword model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model has overfit to the clean lab audio and cannot generalize to noisy field conditions.", "The microphone's analog-to-digital converter is clipping the raw audio waveform before it reaches the MCU.", "The calibration dataset for INT8 quantization was not representative, causing activation values from loud sounds to overflow.", "The Tensor Arena in SRAM is too small, and the larger activations from loud sounds are causing memory corruption."], "correct_index": 2}}, {"id": "tinyml-0535", "title": "The Keyword Spotting Memory Blowout", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What FP32 parameter memory savings result from replacing the standard 3x3 convolution with a depthwise separable convolution?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The standard convolution requires ~72 KB, while the depthwise separable version requires only ~1.1 KB, an almost 66x reduction.", "The device is compute-bound, not memory-bound. A Cortex-M4 cannot execute this many FLOPs, so changing convolution type is irrelevant.", "The standard convolution requires ~72 KB for parameters, while the depthwise separable version requires ~9.1 KB, an almost 8x reduction.", "The standard convolution requires ~18 KB. The depthwise separable version requires ~2.3 KB. The savings are not significant enough."], "correct_index": 2}}, {"id": "tinyml-0537", "title": "The Smart Doorbell's Update Dilemma", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which monthly update plan has lower TCO for 1M doorbells: raw-audio OTA or federated gradients?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 2}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Centralized OTA, because the on-device compute cost of FL would be too high and drain the battery.", "Centralized OTA, because it's more secure; a signed firmware image prevents tampering, unlike FL which is vulnerable to data poisoning.", "Federated Learning, because it has a significantly lower data transfer TCO and preserves user privacy by not uploading raw audio.", "The annual costs for both are under $500, so they are negligible. 
Choose Centralized OTA for its simpler implementation."], "correct_index": 2}}, {"id": "tinyml-0539", "title": "The Saturated Microphone", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you fix INT8 activation saturation when noisy audio drives first-layer values from [-10,10] to [-90,90]?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Add a clipping function before the first layer to cap activations at 10.0.", "Recompile the model using FP16 precision for the first layer (mixed precision).", "Re-run the quantization calibration with a dataset that includes noisy audio samples.", "Increase the number of channels in the first convolutional layer to better capture features."], "correct_index": 2}}, {"id": "tinyml-0540", "title": "The Transformer's Memory Trap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the Transformer architecture riskier than the CNN under the Cortex-M4's 256 KB SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Transformer's MAC count is too high for the Cortex-M4's clock speed, causing a latency violation.", "Transformer runtimes have a large fixed memory overhead, which when added to the 150KB of parameters, exceeds the 256KB budget.", "The activation memory scales quadratically with sequence length due to the self-attention matrix, creating a single large tensor that consumes the SRAM budget.", "The attention mechanism requires FP32 weights to function correctly, making the true parameter size 600KB (150K * 4)."], "correct_index": 2}}, {"id": "tinyml-0545", "title": "The Dual-Keyword Memory Overflow", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What architectural change should you make when doubling KWS output channels threatens the Cortex-M4 SRAM budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The activation memory is the bottleneck. The input audio features must be downsampled to a lower resolution.", "The Cortex-M4 is not powerful enough. 
The device requires a hardware upgrade to a Cortex-M7 with more SRAM.", "Replace the standard convolution with a depthwise separable convolution to increase capacity while reducing parameter count.", "Keep the larger standard convolution but apply 80% unstructured weight pruning to fit it into memory."], "correct_index": 2}}, {"id": "tinyml-0555", "title": "The Clock Tree Surprise", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How is running slower using *more* energy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 16 MHz clock reduces dynamic power proportionally to 10 mW, yielding a total energy of 1.0 mJ per inference, saving battery life.", "Running at 16 MHz uses 1.5 mJ per inference because dynamic power scales linearly with frequency but active time scales inversely, while accumulating 5x more leakage energy.", "Static (leakage) power is voltage-dependent. At 16 MHz the MCU stays active 5x longer (100 ms vs 20 ms), accumulating 1.51 mJ total energy compared to 1.01 mJ for race-to-sleep.", "The 16 MHz clock reduces the active power to 15 mW, saving 35 mW, and yielding a lower total energy of 0.3 mJ per inference."], "correct_index": 2}}, {"id": "tinyml-0556", "title": "The Branch Prediction Penalty on MCU", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a branch-based INT8 ReLU consume ~38% of Cortex-M4 inference time for 500,000 activations?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 1}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M4 has a branch predictor that mispredicts ReLU branches 50% of the time, causing 10-cycle pipeline flushes on each miss.", "The Cortex-M4 has no dynamic branch predictor. With ~50% negative activations, every taken branch flushes the 3-stage pipeline (1-3 penalty cycles per element). For 500K activations, this wastes ~750K cycles. 
Fix: use a branchless USAT instruction or CMSIS-NN's vectorized SIMD ReLU for up to 10x speedup.", "The compiler is generating floating-point comparison instructions for the INT8 values, triggering a software emulation trap on each comparison since the M4 has no FPU.", "The ReLU function is memory-bound because each activation requires a cache miss to load from SRAM, and the Cortex-M4's single-cycle SRAM interface cannot keep up."], "correct_index": 1}}, {"id": "tinyml-0561", "title": "SPI DMA Single-Buffer Sample Drops", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does single-buffer SPI DMA at 1.6 kHz drop samples during 30-50 ms Cortex-M4 inference, and how do you fix it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0563", "title": "The Lookup Table Optimization", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the memory cost, the speedup, and when this optimization breaks down?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0564", "title": "The RP2040 Dual-Core ML", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can two 133 MHz RP2040 cores halve an 80 ms audio classifier, or does SRAM bus contention limit speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0565", "title": "The nRF5340 Network Core Split", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you partition memory and processing, and what happens if the BLE stack on the network core needs to interrupt the application core mid-inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0566", "title": "The Mel Spectrogram Compute Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Does 40-bin Mel feature extraction on a 168 MHz Cortex-M4 fit beside a 15 ms model in a 100 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0573", "title": "On-Device Data Collection for Retraining", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How much triggered anomaly data can 2,000 Cortex-M4 vibration sensors store in 114 KB flash and upload over BLE?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": "", "options": ["Use raw continuous logging: stream 6.4 KB/s of sensor data to flash, filling 114 KB in 17.8 seconds, then upload via BLE. This gives the ML team the full fidelity data they need for retraining.", "Use triggered + compressed logging: record only anomaly events (800 bytes/event, ~10/day = 8,000 bytes/day). Each 114 KiB log holds about 14.6 days. BLE upload at 60 KiB/s takes 1.9s per device. For 2,000 devices through 10 gateways serving one device each: about 6.3 minutes total. This yields about 292,000 labeled edge cases (233 MB decimal) despite each device having only 114 KiB of logging space.", "Upgrade to a microcontroller with more flash (16 MB) to store weeks of raw sensor data before uploading.", "Use federated learning to retrain the model on-device, eliminating the need to collect and upload any raw sensor data to the cloud."], "correct_index": 1}}, {"id": "tinyml-0575", "title": "The SPI Bus Capacitance Limit", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does adding four IMUs to a 20 MHz SPI bus corrupt data even when firmware and chip-selects are correct?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The SPI clock of 20 MHz exceeds the maximum supported frequency when multiple slave devices are present on the same bus.", "Adding 4 IMUs increases parasitic bus capacitance ~4x. The RC time constant rises from ~8ns to ~30ns. At 20 MHz, the signal cannot reach the 3.3V logic threshold in time.", "The SPI chip select lines are experiencing cross-talk at 20 MHz.", "The MCU's SPI peripheral has a hardware bug that corrupts data when more than 2 slave devices share the same bus."], "correct_index": 1}}, {"id": "tinyml-0578", "title": "The I2C Clock Stretching Deadlock", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a crashed I2C sensor hold SCL low forever and freeze an MCU in a wait loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0584", "title": "Cortex-M55 + Ethos-U55 + Cortex-A32 — Which Core Runs What?", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which Alif Ensemble E7 compute element should run detection, tracking, and H.264 streaming, and why?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 0}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run all three tasks on the Cortex-A32 since it has the highest clock speed, Linux support, and can multitask with threads. The NPU and M55 add unnecessary complexity.", "Assign person detection to Ethos-U55 (128 MACs/cycle, 20.9ms, 20 mW — 357x more energy-efficient than A32 for this workload). Assign H.264 encoding to Cortex-A32 (needs MMU + caches for codec complexity, 30ms at 200 mW). 
Assign object tracking to Cortex-M55 (lightweight Kalman filter, 1ms at 15 mW). This heterogeneous split saves ~2.24W vs running everything on A32.", "Assign all ML tasks (detection + tracking) to the Ethos-U55 NPU and use the A32 only for H.264 encoding. The M55 should remain idle to save power.", "Assign person detection to Cortex-M55 with Helium (it has SIMD for CNNs), H.264 to Cortex-A32, and use the Ethos-U55 NPU for tracking since it can maintain state across frames."], "correct_index": 1}}, {"id": "tinyml-0585", "title": "Updating 400 KB FP32 and 100 KB INT8 Models Over BLE 5.0", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Per device, how do FP32 and INT8 model formats change OTA transfer time, bandwidth, and radio energy?", "chain_ids": ["tinyml-chain-auto-secondary-004-11"], "chain_positions": {"tinyml-chain-auto-secondary-004-11": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-11": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Quantization format doesn't affect OTA size because the model is compressed during transmission regardless of the weight precision.", "INT8 quantization reduces OTA model size by 4x (400 KB to 100 KB), cutting BLE transfer time from 8s to 2s and energy per update from 211 mJ to 52.8 mJ.", "The OTA size reduction from quantization is irrelevant because BLE 5.0's built-in compression handles the FP32 model efficiently.", "INT8 reduces OTA size by 2x (not 4x) because the TFLite flatbuffer format adds metadata overhead that scales with the number of parameters, partially negating the precision reduction."], "correct_index": 1}}, {"id": "tinyml-0587", "title": "Duty Cycle for Energy Harvesting Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the maximum inference rate (inferences per minute) the energy budget supports?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 1}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Maximum rate: 200 uW / 30 mW = 0.67% duty cycle. At 12ms per inference: 0.0067 x 60,000ms/min / 12ms = 33 inferences/min.", "Energy per cycle: inference (360 uJ) + sensor (10 uJ) + BLE TX (800 uJ) = 1,170 uJ. Effective harvest: 200 uW x 70% = 140 uW. Period: 1,170/140 = 8.36s. Rate: ~7/min. BLE dominates (68%). Optimization: batch 10 results per BLE packet, amortizing BLE to 80 uJ/inference. New rate: ~18/min (2.5x improvement).", "The vibration harvester cannot sustain any inferences because 200 uW is insufficient to power the Cortex-M4F's minimum active current of 30 mW.", "Maximum rate: 60 inferences/min. 
The harvester generates 200 uW continuously, and inference costs only 360 uJ, so the energy budget is limited solely by the 12ms inference time."], "correct_index": 1}}, {"id": "tinyml-0588", "title": "The Floating Point Sensor Tax", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does float scaling an integer sensor stream cost more than the first three INT8 layers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0591", "title": "INT8 Clipping from Factory Calibration Mismatch", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization failure explains the factory-floor accuracy collapse, and what mixed-precision fix fits the SRAM budget?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 2}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0592", "title": "The Ghost in the Microcontroller", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the intermittent close-range radar detection failure in the INT8 automotive model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0594", "title": "The Saturation Catastrophe", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What calibration problem causes the INT8 keyword-spotting model to collapse in noisy factory audio?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0596", "title": "The Silent Drift Catastrophe", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What cascading failure is caused by checkpointing adaptive normalization statistics to Flash every 100 inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0597", "title": "The Instruction Cache Thrashing Loop", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Where do the missing 3.1 ms go when a Cortex-M7 depthwise convolution takes 4.2 ms instead of 1.1 ms?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 3}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0601", "title": "The Watchdog Interrupt Starvation", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does moving an 800ms Cortex-M4 anomaly model into a timer ISR make a 500ms watchdog reset the device?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 3}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0602", "title": "The SRAM Bank Collision", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does camera DMA into SRAM1 slow Cortex-M7 inference by 25% even though DMA uses zero CPU cycles?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0603", "title": "The MCU Throughput Ceiling", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 sustain 10 inferences per second for a 10M-MAC model while serving sensors and UART?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 4}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0604", "title": "The Operator Support Gap", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 on a Cortex-M4 replace Resize Bilinear, Pad, and Swish to keep CMSIS-NN coverage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0605", "title": "The MAX78000 CNN Accelerator", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you explain architecturally why the accelerator achieves 30-100x better energy efficiency, and identify what workloads it cannot accelerate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0607", "title": "The Energy Harvesting Inference Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How many inferences per hour can you sustain, and what happens during a cloudy day when light drops to 50 lux?", "chain_ids": ["tinyml-chain-auto-019-08"], "chain_positions": {"tinyml-chain-auto-019-08": 2}, "chain_tiers": {"tinyml-chain-auto-019-08": "primary"}, "scenario": "", "details": {"common_mistake": 
"", "realistic_solution": "", "napkin_math": "", "options": ["At 500 lux: 27 inferences/hour. At 50 lux: 0 inferences/hour because the 1 uW harvest exactly equals sleep power.", "The solar cell generates enough energy at 50 lux for 5 inferences/hour.", "At 500 lux: 30 inferences/hour (36 mJ / 1.2 mJ).", "The system can sustain 27 inferences/hour regardless of light level."], "correct_index": 0}}, {"id": "tinyml-0610", "title": "Fusing Accelerometer + Microphone + Temperature on One MCU", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design the memory layout, DMA strategy, and scheduling for three sensor branches in 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-010-15"], "chain_positions": {"tinyml-chain-auto-secondary-010-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Run three separate models sequentially, each with its own 120 KB tensor arena and 80 KB weights in SRAM, totaling 360 KB SRAM for arenas alone.", "Use a shared-backbone multi-head architecture: shared backbone (100 KB flash) + 3 small heads (30 KB total flash). Single tensor arena: 180 KB SRAM. DMA strategy: mic uses double-buffer (16 kHz continuous), accelerometer uses single-buffer (1 kHz bursts), temp uses polled I2C (1 Hz). Schedule work around the mic buffer cadence and run lower-rate branches between audio windows. Total SRAM: 240 KB / 256 KB. This avoids the 360 KB SRAM requirement of three separate models.", "Use external PSRAM to hold all three tensor arenas, accessing them via SPI. The 2-3x latency penalty is acceptable since the temperature model only runs once per second.", "Time-multiplex a single model across all three sensor streams by reloading different weights from flash for each sensor, running the mic model at 62.5 Hz, accelerometer at 1 Hz, and temperature at 0.017 Hz."], "correct_index": 1}}, {"id": "tinyml-0611", "title": "Co-Designing a TinyML Accelerator", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What sub-0.5 mm² INT8 accelerator blocks would beat CMSIS-NN on a Cortex-M4 by 10x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Build a mini-GPU with 16 streaming multiprocessors (0.4 mm²) to handle all floating-point and integer ops.", "Build a weight-stationary systolic array with 64 INT8 MACs, 32KB weight SRAM, and 8KB activation SRAM (~0.38 mm²). 
Leave activations, pooling, and control flow to the M4.", "Add a 512KB L2 cache (0.45 mm²) to the Cortex-M4 to solve the CMSIS-NN memory bandwidth bottleneck.", "Implement a dedicated FP16 matrix multiplication unit (0.2 mm²) that offloads the entire network execution from the M4."], "correct_index": 1}}, {"id": "tinyml-0613", "title": "NPU Delegation Coverage Determines Actual Speedup", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "With 2.0 ms CPU-only time and four 50 us fallback transitions, what speedup follows from 95% compute delegation?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 1}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Speedup is ~5x as expected. 82% of layers on the NPU means 82% of compute is accelerated, giving 1/(1-0.82) = 5.6x by Amdahl's Law.", "Actual speedup is ~6.7x (not 5x or 20x). Layer count (82%) misleads because compute delegation is 95% (28.5M/30M MACs). Naive Amdahl's predicts 1/0.05 = 20x, but each NPU-to-CPU fallback transition costs ~50 us of data transfer dead time. With 4 transitions: ~200 us overhead, plus 100 us of remaining CPU compute, so 2.0 ms / 0.3 ms ≈ 6.7x.", "Speedup is ~20x because 95% of compute MACs run on the NPU, which is 20x faster than the M55 CPU per MAC.", "Speedup is only ~2x because the Ethos-U55's 128 MACs/cycle throughput is bottlenecked by the narrow data bus between the NPU and M55, limiting effective bandwidth to 10% of peak."], "correct_index": 1}}, {"id": "tinyml-0614", "title": "Sub-threshold Voltage Operation — Power vs Speed Trade-off", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the Apollo4 meet the 200ms deadline, and what is the energy savings compared to a standard Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Apollo4 cannot meet the 200ms deadline because sub-threshold operation at 0.5V reduces the maximum clock to ~10 MHz, making inference take over 1 second.", "The Apollo4 at 0.5V/96 MHz meets the deadline easily: 1M MACs at 2 MACs/cycle = 5.2ms inference (194.8ms headroom). Power: 3 uW/MHz x 96 = 288 uW (vs standard M4 at 50 mW). Energy per inference: 1.5 uJ vs 150 uJ — 100x more efficient. 
Sub-threshold operation is transformative when the workload fits within the reduced clock speed's deadline.", "Power scales as V^2, so 1.2V to 0.5V gives (1.2/0.5)^2 = 5.76x power reduction at the same 168 MHz clock speed, and the Apollo4 meets the deadline with a 5.76x power savings.", "The Apollo4 meets the deadline but the power savings is only 2x because sub-threshold operation increases leakage current, which offsets most of the dynamic power reduction."], "correct_index": 1}}, {"id": "tinyml-0617", "title": "The Input-Dependent Watchdog Reset", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did an optimization that improved average-case compute latency lead to a catastrophic, input-dependent failure?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 2}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0618", "title": "The Watchdog Reboot Loop", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might the system be failing in this non-linear way, and why is the playbook's recommendation likely wrong for this specific failure?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0622", "title": "The Checkpoint-Watchdog Death Spiral", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does the watchdog and checkpoint design enter a deterministic reboot loop during cold-start inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0623", "title": "The OTA Wear-Out Catastrophe", "topic": "ota-firmware-updates", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you assess this OTA design and predict the specific, time-delayed physical failure mode that caused this mass-bricking event?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0627", "title": "The Night-Rain Quantization Failure", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might it be a dangerous trap, and what more robust system-level solution should you justify to your team?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0628", "title": "The Watchdog and the Unseen Workload", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": 
"evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the root causes of a localized inference latency increase causing watchdog reboots?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0629", "title": "The Fault-Tolerant Battery Killer", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What flash-write energy miscalculation causes the checkpointing sensors to drain batteries in under 3 days?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0634", "title": "The Deaf Automobile Watchdog", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is simply enabling a watchdog timer an insufficient fix for the vehicle voice assistant going deaf?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A watchdog timer requires a dedicated RTOS to function correctly, which is missing from this bare-metal system.", "The system is experiencing unmonitored data drift. Bad inputs cause NaN/Inf propagation that hangs the model. 
A watchdog just creates an infinite reboot loop.", "A 2-second watchdog is too short for an automotive environment and will frequently trigger during normal operations like a cold engine start.", "The watchdog resets the entire vehicle's CAN bus, which violates automotive safety integrity level (ASIL) standards."], "correct_index": 1}}, {"id": "tinyml-0635", "title": "The Silent Sensor Death Spiral", "topic": "real-time-deadlines", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why would a model update that passed memory checks cause a catastrophic watchdog loop, and why would it only affect a small subset of the fleet?", "chain_ids": ["tinyml-chain-auto-024-11"], "chain_positions": {"tinyml-chain-auto-024-11": 4}, "chain_tiers": {"tinyml-chain-auto-024-11": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0636", "title": "The Federated Learning Battery Drain Catastrophe", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why did a modest increase in model size and accuracy lead to a catastrophic, non-linear failure in battery life, and what critical TCO factor did the team's plan ignore?", "chain_ids": ["tinyml-chain-auto-025-04"], "chain_positions": {"tinyml-chain-auto-025-04": 3}, "chain_tiers": {"tinyml-chain-auto-025-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0640", "title": "The Hypersensitive Wake-Word", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What dynamic range mismatch causes the INT8 wake-word model to become hypersensitive to loud background noise?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0641", "title": "The Watchdog Boot Loop Catastrophe", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why is the proposal to lengthen the watchdog timeout likely wrong, what is the probable root cause of the failure, and what catastrophic, non-linear fleet behavior is this masking?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0643", "title": "The Keyword Spotting Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete inference pipeline for an always-on keyword spotting system on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0644", "title": "The SIMD Lane Starvation", "topic": "mcu-compute-constraints", "competency_area": 
"compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why do Cortex-M55 Helium SIMD instructions yield only a 1.6x speedup when memory is accessed via byte-by-byte loads?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 5}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The Cortex-M55's Helium unit has a 4-cycle latency for vector multiply-accumulate instructions, so processing 4 elements takes 4 cycles — only marginally better than scalar's 4 cycles.", "The SIMD execution unit was starved by scalar memory loads. The CPU can multiply 4 INT8 values in 1 cycle, but loading them byte-by-byte (LDRB) takes 4 cycles. Fix: use a packed, vector-friendly data layout and aligned load path so four bytes can be loaded efficiently before feeding the SIMD unit. Proper packed loads achieve the ideal 4x path in this model; scalar loads limit it to 1.6x.", "The Cortex-M55 compiler auto-vectorizes the scalar loop identically to the SIMD intrinsics, so there is no performance difference between the two implementations.", "The 1.6x speedup is correct and expected — SIMD on Cortex-M only helps with floating-point operations, not 8-bit integer arithmetic."], "correct_index": 1}}, {"id": "tinyml-0645", "title": "The Cache-Line False Sharing", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does Core 0's Cortex-M7 ML inference slow by 30% when Core 1 writes independent SRAM variables?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0646", "title": "The MCU Roofline", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Where is the ridge point, and what does it tell you about which models are feasible?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 4}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The ridge point is at ~100 Ops/Byte (similar to GPUs), meaning most TinyML models are memory-bound.", "The ridge point is 2.0 Ops/Byte. 
Conv2D (~18 Ops/Byte) and depthwise (~9 Ops/Byte) are above it, making them compute-bound.", "The roofline model doesn't apply to MCUs because they lack a cache hierarchy.", "The ridge point is at ~0.1 Ops/Byte because MCU memory bandwidth far exceeds compute throughput."], "correct_index": 1}}, {"id": "tinyml-0647", "title": "The MCU NAS Search Space", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What constraints must your search space encode that a standard NAS for desktop/cloud would ignore?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 2}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0649", "title": "MCUNet Search Space Design", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you design the search space to find a competitive model in 6x less time?", "chain_ids": ["tinyml-chain-auto-016-07"], "chain_positions": {"tinyml-chain-auto-016-07": 3}, "chain_tiers": {"tinyml-chain-auto-016-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0650", "title": "Sub-Milliwatt Always-On Wake Word Detection", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture keeps always-on wake-word detection under 1 mW including microphone, ADC, features, and inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a Cortex-M4 at the lowest clock speed (4 MHz) to stay under 1 mW. At 4 MHz, the MCU draws ~1.5 mW active, plus 0.5 mW mic + 0.2 mW ADC = 2.2 mW total.", "A tiered always-on architecture: Stage 1 — analog VAD or ultra-low-power DSP (~20-50 uW) screens for speech-like audio 100% of the time. Stage 2 — lightweight feature extractor (~200 uW) activates only when VAD triggers (~10% of time). Stage 3 — full neural network (~1.5 mW) runs only on likely wake words (~1% of time). 
Average power: ~85 uW.", "Use a standard Cortex-M4 running the full neural network continuously but power-gate the microphone between inference cycles to stay under 1 mW.", "Achieve sub-milliwatt by using INT4 quantization to reduce the model's compute by 4x, bringing the M4's inference power from 2 mW to 0.5 mW."], "correct_index": 1}}, {"id": "tinyml-0652", "title": "INT8-to-Float Dequantization Trap", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why does INT8-to-float dequantization stay slow even with a Cortex-M4F hardware FPU?", "chain_ids": ["tinyml-chain-auto-009-07"], "chain_positions": {"tinyml-chain-auto-009-07": 3}, "chain_tiers": {"tinyml-chain-auto-009-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0654", "title": "The Contextual Awareness Crash", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What memory management architecture avoids the TFLite Micro single-arena OOM when switching between wake-word and speaker ID models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0655", "title": "The Zero-Copy Race Condition", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a complete, robust data flow architecture that achieves a safe, zero-copy pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0657", "title": "The DMA Energy Break-Even Point", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What is the DMA-vs-CPU-copy energy break-even size, and is a 9,216-byte camera frame above it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0658", "title": "The Siren's Screech: Designing a Robust Hearing Aid", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What mixed-precision deployment strategy keeps the hearing-aid model within the SRAM and latency budgets while avoiding overflow?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 4}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0663", "title": "The Conversational Doorbell's Memory Deficit", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What SRAM-first strategy makes 
the KV cache feasible without paging it through Flash?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 5}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0667", "title": "The Look-aside Attention Cache", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three design decisions for a look-aside KV-cache architecture on a 512KB SRAM MCU, and how do you justify them quantitatively?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0670", "title": "The Ghost in the Dashboard", "topic": "monitoring-observability", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What three-part on-device architecture can detect humid-climate audio drift, disable the model, and store a diagnostic fingerprint?", "chain_ids": ["tinyml-chain-auto-secondary-017-67"], "chain_positions": {"tinyml-chain-auto-secondary-017-67": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-017-67": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0671", "title": "The Redundant Vision Failure", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can two Cortex-M7 vision MCUs detect silent weight corruption and hot-reload within a 100 ms frame budget using only on-chip resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0672", "title": "The Automotive Sensor Fusion Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Propose a quantization strategy for the two branches of the network, and determine what the dominant factor in your energy budget calculation is: compute or memory access?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0673", "title": "The Streaming Sensor Fusion Dilemma", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which decisions on context, model size, quantization, and SRAM layout make this MCU design feasible?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Apply virtual memory by using external flash as swap space, paging KV-cache layers in and out as needed, similar to how mobile operating systems handle memory pressure.", "Three architectural decisions: (1) Reduce context from 512 to a 64-token sliding window 
plus a compact summary state. (2) Reduce the deployable model to a small SRAM-resident variant, such as 4 layers, and quantize KV-cache to INT8: 2 * 64 * 4 * 128 * 1 = 64 KB. (3) Use a fixed SRAM arena for KV ring, activations, and sensor buffers with no Flash-backed paging in the 1ms loop. Example SRAM: 64 KB KV + 96 KB activations/features + 32 KB stack/RTOS margin = 192 KB, leaving ~64 KB headroom.", "Quantize the entire model from FP16 to INT4, reducing the KV-cache from 4 MiB to 1 MiB. Then use aggressive structured pruning to remove 75% of attention heads, bringing it to 256 KB.", "The junior team is correct — it is impossible. A transformer with 16 layers and 512-token KV-cache fundamentally cannot run on a 256 KB SRAM MCU. The requirement should be changed to a simpler RNN-based model."], "correct_index": 1}}, {"id": "tinyml-0674", "title": "The Silent Failure of the In-Car AI", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are your first three architectural decisions, and how do you justify them with quantitative analysis?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0675", "title": "The Concurrent Wake-Word Crisis", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the scheduler and tensor arenas be designed so two TinyML models can run without additive SRAM residency?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 3}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0678", "title": "The Private Factory Floor", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can federated learning reduce false positives for FactorySense while staying within Cortex-M4 battery and business constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0680", "title": "Federated Learning Strategy for Low-Light Driver Monitoring", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid strategy of centralized low-light data, fleet-wide federated updates, and on-device power constraints reduces false positives across the 1M-vehicle driver-monitoring fleet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0681", "title": "The Silent Failure of the Emergency Keyword", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What root-cause analysis and mitigation plan should be used after INT8 quantization 
makes the emergency KWS model fail in noise?", "chain_ids": ["tinyml-chain-auto-009-04"], "chain_positions": {"tinyml-chain-auto-009-04": 3}, "chain_tiers": {"tinyml-chain-auto-009-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0682", "title": "The Federated Wearable ROI Proposal", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What ROI and power analysis justifies or limits federated learning for the smart tremor-detection patch?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0683", "title": "The TinyML Conversational AI Memory Wall", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system handle conversational context and follow-up questions within the strict 480 KB SRAM and 2 MB Flash constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0684", "title": "The Billion-Dollar Doorbell Breach", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What phased response should mitigate the ML accelerator side-channel vulnerability without a physical recall?", "chain_ids": ["tinyml-chain-auto-025-05"], "chain_positions": {"tinyml-chain-auto-025-05": 3}, "chain_tiers": {"tinyml-chain-auto-025-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0685", "title": "The Dusk Disaster: Quantization-Aware Architecture", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What quantization-aware architecture plan should address the AEB model's dusk failures within MCU memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0686", "title": "Automotive Camera Occlusion Detection Within a 1 ms Deadline", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How do you propose to detect this occlusion on-device, and what is the recovery strategy without violating the 1ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0689", "title": "The Guardian-Node Dilemma: TCO vs. 
Security for an On-Device Learning System", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What hybrid update strategy balances TCO and security for Guardian-Node devices without a secure element?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0696", "title": "The Flat Memory Reality", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why should a Cortex-M4 not malloc activation buffers from a 256 KB bare-metal SRAM heap during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0697", "title": "SPI Polling Busy-Wait Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does busy-wait SPI polling make Cortex-M0+ vibration inference 3x slower than expected?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 1}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0698", "title": "The Flash-SRAM Boundary", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why can Cortex-M4 weights stay in 1 MB flash while activations must fit in 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 2}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0699", "title": "Why TFLite Micro Uses a Static Tensor Arena on Cortex-M0+", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does TFLite Micro require a static tensor arena instead of malloc on a small Cortex-M0+, and how does dynamic allocation affect WCET analysis?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 2}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0700", "title": "The TFLite Micro Arena Sizing", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do you determine the minimum arena size without trial and error, and why is it not simply the sum of all tensor sizes?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0702", 
"title": "SRAM Needed for MobileNet Activations", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you estimate the peak SRAM required for the activation tensors and determine if the model fits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The model's 3.4 MB of parameters don't fit in 512 KB SRAM, so the model cannot run on this hardware without significant compression.", "\"Parameters (3.4 MB) live in flash, not SRAM. SRAM holds activations only. Peak activation at 128x128 INT8 MobileNetV2 is 448 KB (expansion layer input 64 KB + output 384 KB). With 80 KB firmware overhead, available SRAM is 432 KB — deficit of 16 KB. Fix: use width multiplier 0.75x, reducing peak to 336 KB (fits with 96 KB headroom) at only 2% accuracy loss (69.8% vs 71.8%).\"", "MobileNetV2 fits easily because INT8 quantization reduces activation memory by 4x compared to FP32, bringing peak SRAM to ~108 KB.", "The model fits if you use in-place operations for all depthwise convolution layers, which eliminates the need for separate input and output activation buffers."], "correct_index": 1}}, {"id": "tinyml-0703", "title": "Flash Wear from Logging Frequency", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "With a naive pointer update that erases one 4 KB metadata sector per 32-byte log write, how long will 100K-cycle NOR last at 10 inferences/s?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 0}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["19.8 years (calculating 200 GB total divided by 27.6 MB/day).", "~2.8 hours because naive metadata pointer updates erase a 4 KB sector on every write.", "Indefinitely, because modern NOR flash has built-in wear leveling.", "17.4 hours until the flash fills up capacity-wise."], "correct_index": 1}}, {"id": "tinyml-0704", "title": "The FreeRTOS Heap Exhaustion", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why did spawning and deleting an ML task per image cause heap exhaustion despite having plenty of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0705", "title": "TFLite Micro Unaligned Access HardFault on Cortex-M0+", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does TFLite Micro's packed tensor memory layout cause unaligned access on the Cortex-M0+, and why does the ISA constraint force you to pad your ML tensors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0706", "title": "The DMA Channel Collision", "topic": "mcu-compute-constraints", "competency_area": "memory", 
"track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does the ML inference's DMA bandwidth requirement conflict with the sensor DMA, and how can priority inversion resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0707", "title": "The TFLite Micro Heap Overhead", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does AllocateTensors() fail when a 32 KB MCU gives TFLite Micro exactly a 15 KB tensor arena?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 3}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0708", "title": "The CMSIS-NN Alignment Fault", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an unaligned int8_t tensor_arena[20000] crash CMSIS-NN with a Hard Fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0709", "title": "The Tensor Arena Overflow", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a 210 KB tensor arena fit when a Cortex-M4 has only 200 KB of SRAM available, and what must change if the model architecture cannot be changed?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 3}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0710", "title": "The Double Buffering DMA Strategy", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a DMA double-buffering scheme and prove mathematically that it eliminates data loss?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 2}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Use a large circular buffer (32 KB) in SRAM. The CPU will always catch up because inference (40ms) is faster than the fill time. No synchronization needed.", "Allocate two 3,072-byte buffers (512 samples x 6 bytes/sample). Configure STM32 DMA in double-buffer mode (DBM bit). DMA fills Buffer A while CPU processes Buffer B, then swap. Fill time: 160ms. Process time: 40ms. Utilization: 25%. 
This mathematically guarantees no data loss because process time < fill time, and the buffers are hardware-isolated — DMA physically cannot write to the buffer the CPU is reading.", "Use a single buffer with an interrupt flag: the DMA sets a flag when complete, and the CPU polls the flag before reading. This prevents corruption because the CPU never reads during an active DMA transfer.", "Disable DMA during inference and re-enable it after. The 40ms gap in sensor data is acceptable because the vibration signal changes slowly at 3.2 kHz."], "correct_index": 1}}, {"id": "tinyml-0711", "title": "The Cache Miss Penalty", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you explain the 2.75x slowdown and estimate the total cache misses and average re-fetches per model cache line?", "chain_ids": ["tinyml-chain-auto-009-08"], "chain_positions": {"tinyml-chain-auto-009-08": 2}, "chain_tiers": {"tinyml-chain-auto-009-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The first run is slow due to TFLite Micro interpreter initialization overhead. Subsequent runs reuse cached interpreter state and skip the setup, achieving the 8ms baseline.", "The 400 KB model far exceeds the 16 KB D-cache. When executing from flash, severe cache thrashing occurs. 14ms of stall = ~6.7M penalty cycles = ~1.12M additional cache misses at a 6-cycle miss penalty. Across about 12,800 32-byte model cache lines, that is a rough average of ~87.5 extra fetches per line per inference, not a true miss/access rate. Fix: place the hottest layers in DTCM or improve locality to reduce repeated flash misses.", "The slowdown is caused by flash memory write-back operations that occur on first access. The flash controller must initialize its page buffers, which takes exactly 14ms for a 400 KB binary.", "The 2.75x slowdown is due to branch prediction cold-start. 
The Cortex-M7's branch predictor has no history on the first run, causing pipeline stalls on every conditional branch in the inference loop."], "correct_index": 1}}, {"id": "tinyml-0712", "title": "The STM32H7 Dual-Bank Flash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does STM32H743 dual-bank flash let a 350 KB model update run over BLE while inference continues?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0714", "title": "MCU Flash Wear Monitoring", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the circular flash log be designed so the ML output cadence stays within the 10-year endurance budget?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 1}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash has 100,000 write cycles — at 1 inference/second, that's enough for 100,000 seconds = 27.8 hours of operation before the flash wears out.", "At 1 Hz with 64 bytes/inference, the log writes ~5.27 MiB/day. Across a 64 KiB partition with even circular wear leveling, that is ~84.4 full-partition wraps/day, so each 4 KB sector sees ~84.4 P/E cycles/day and reaches 100K endurance in ~3.25 years. Fix: aggregate into 1-minute summaries of 32 bytes each, reducing writes to 45 KiB/day = 0.703 P/E cycles/sector/day = ~2,566 cycles over 10 years.", "Flash endurance is determined only by total bytes written. 64 bytes x 86,400 inferences/day x 3,650 days = 20.2 GB, so compare byte totals and ignore which 4 KB erase blocks are being cycled.", "Use the ESP32's built-in wear leveling library, which automatically distributes writes across all flash sectors, ensuring uniform wear and guaranteeing the 10-year lifetime regardless of inference output size."], "correct_index": 1}}, {"id": "tinyml-0715", "title": "Anomaly Detection on Streaming Sensor Data with Limited Memory", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you fit the streaming pipeline, the model, and the baseline statistics in 256 KB SRAM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Store the last 60 seconds of raw sensor data (1.15 MB) as the baseline reference for anomaly comparison, using external PSRAM to overcome the 256 KB SRAM limit.", "Use exponential moving average (EMA) statistics instead of raw data for the baseline. 
Memory layout utilizes 218.4 KB (85.3%) of the 256 KB SRAM.", "Reduce the sensor sampling rate from 3.2 kHz to 800 Hz to fit 60 seconds of raw baseline data (288 KB) within the 256 KB SRAM budget.", "Run inference only on the most recent 1-second window with no baseline comparison, eliminating the need for storage."], "correct_index": 1}}, {"id": "tinyml-0716", "title": "Execute-in-Place vs Copy-to-SRAM for Model Weights", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the STM32H7 copy all 500 KB of INT8 weights to SRAM, or only the 350 KB of large layers that thrash the M7 D-cache?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 2}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Copy all 500 KB of weights to SRAM before inference. SRAM is 5x faster than flash, so inference speeds up by 5x.", "Selective SRAM copy. Copy only the 5 large layers (350 KB) to SRAM to achieve a 1.87x speedup while saving 150 KB compared to a full copy.", "Keep all weights in flash with XIP. The M7's ART Accelerator handles sequential reads perfectly.", "Use the M7's ITCM (64 KB) for the most frequently accessed weight layers and keep the rest in flash."], "correct_index": 1}}, {"id": "tinyml-0717", "title": "DMA Transfer Time vs Inference Time", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can the system run in real-time without dropping audio samples, and what's the critical timing constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The system cannot run in real-time because DMA and CPU share the AHB bus, and the 32 KB/s DMA transfer rate consumes most of the available bandwidth.", "Yes, real-time is feasible. Fill time: 1024 samples / 16 kHz = 64ms. Process time: MFCC (3ms) + inference (15ms) = 18ms. Since 18ms < 64ms, the CPU finishes processing Buffer A before DMA fills Buffer B. CPU utilization: 28%. Critical constraint: process time must always be less than fill time. 
At 48 kHz sampling (21.3ms fill), utilization rises to 85% — danger zone for jitter.", "The system drops samples because the I2S DMA transfer consumes 50% of the AHB bus bandwidth, leaving insufficient bandwidth for the CPU to access the tensor arena during inference.", "Real-time is possible only if the MFCC feature extraction is offloaded to a hardware DSP accelerator, freeing the CPU to focus exclusively on the 15ms neural network inference."], "correct_index": 1}}, {"id": "tinyml-0718", "title": "Multi-Model SRAM Partitioning", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can VAD, KWS, and command models with 8 KB, 45 KB, and 60 KB arenas coexist in 512 KB Cortex-M7 SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Load all three models' weights into SRAM (20 + 80 + 120 = 220 KB) plus three separate activation arenas (8 + 45 + 60 = 113 KB) = 333 KB total. This incorrectly counts flash-resident weights against the SRAM arena budget.", "Exploit the cascading trigger pattern: VAD runs continuously, KWS only on VAD trigger, CMD only on KWS trigger. Since KWS and CMD never run simultaneously, share one arena: max(45, 60) = 60 KB. Total activation SRAM: 8 KB (VAD, always resident) + 60 KB (shared KWS/CMD) = 68 KB. Weights live in flash, not SRAM, so 68 KB is the SRAM coexistence test against the 512 KB budget.", "All three models must have dedicated, simultaneously-allocated arenas because the audio pipeline requires continuous processing and cannot be interrupted for context switching.", "Use a single 60 KB arena for all three models, swapping model weights from flash between each inference. 
The 0.5ms weight-loading overhead is negligible compared to inference time."], "correct_index": 1}}, {"id": "tinyml-0720", "title": "The Peak RAM Puzzle", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How can an 8-layer CNN needing 300 KB peak activation RAM fit in 256 KB of Cortex-M7 SRAM?", "chain_ids": ["tinyml-chain-auto-009-01"], "chain_positions": {"tinyml-chain-auto-009-01": 4}, "chain_tiers": {"tinyml-chain-auto-009-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0721", "title": "The Non-Volatile MRAM Trap", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does putting the tensor arena in Apollo4 MRAM drain the battery in 2 days instead of 14?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0722", "title": "The ITCM Execution Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does compiling a custom ML C++ kernel to QSPI Flash cause a 600 MHz Cortex-M7 CPU to effectively run at quarter speed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0723", "title": "The L1 Cache Miss Penalty", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does writing to memory with a massive stride destroy performance on an MCU with an L1 Data Cache?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0724", "title": "The Execute-in-Place Energy Tax", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What physical hardware reality did you ignore that causes reading 1 MB over SPI NOR Flash to drain a battery in 2 days?", "chain_ids": ["tinyml-chain-auto-009-09"], "chain_positions": {"tinyml-chain-auto-009-09": 3}, "chain_tiers": {"tinyml-chain-auto-009-09": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0725", "title": "The Multi-Tenant MCU", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an nRF5340 schedule 80 KB KWS and 140 KB command arenas when both must fit in 230 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Allocate 80 KB + 140 KB = 220 KB for both arenas simultaneously. 
230 KB available, leaving 10 KB headroom — sufficient for both models to coexist.", "Time-multiplex the physical arena memory. During normal listening, run KWS with its 80 KB arena. When CMD triggers, pause KWS and repurpose the arena region for CMD's 140 KB arena. Peak arena SRAM: max(80 KB, 140 KB) = 140 KB, leaving 90 KB for stack, ISR frames, and BLE buffers. If KWS and CMD had to execute concurrently, this reuse would not be valid.", "Run both models in a single TFLite Micro interpreter by concatenating the model graphs. The interpreter will automatically manage memory sharing between the two models.", "Allocate the full 230 KB as a shared arena and let TFLite Micro's dynamic memory planner handle the allocation for both models during concurrent execution."], "correct_index": 1}}, {"id": "tinyml-0726", "title": "The SRAM Bank Conflict Slowdown", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does doubling STM32H7 depthwise channels from 64 to 128 cause a 2.76x slowdown instead of 2x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0727", "title": "The DMA Buffer Corruption", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does Cortex-M7 DMA audio in AXI SRAM produce one confident wrong KWS result every 500 inferences?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0728", "title": "The Flash Read Disturb", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a vibration model reading 400 KB weights from flash lose precision after 6 months until re-flashed?", "chain_ids": ["tinyml-chain-auto-009-06"], "chain_positions": {"tinyml-chain-auto-009-06": 2}, "chain_tiers": {"tinyml-chain-auto-009-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Flash doesn't wear out from reading — only writes cause wear. The accuracy drop must be from a firmware bug that accumulated sensor calibration drift over 6 months.", "NOR flash read disturb: the 400 KB model is read via XIP at 10 inferences/second, accumulating ~1.24 billion reads per cell over 6 months. This exceeds the ~500M read-disturb threshold for 90nm NOR flash, causing ~15% of cells in hot pages to flip bits. The ~480K flipped bits create ~3 LSB average weight noise on INT8 values, matching the observed 12% accuracy drop. 
Fix: periodically refresh (re-write) the model sectors every ~72 days, or move the model to MRAM/FRAM.", "The flash memory cells are slowly losing charge at room temperature (data retention degradation), causing random bit flips that accumulate over 6 months.", "The MCU's voltage regulator has drifted over 6 months, causing the flash read voltage to shift outside the optimal sensing window, producing intermittent read errors."], "correct_index": 1}}, {"id": "tinyml-0729", "title": "The SRAM Fragmentation Crash", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does AllocateTensors() fail to allocate a 300 KB arena when there is 600 KB of free SRAM?", "chain_ids": ["tinyml-chain-auto-009-02"], "chain_positions": {"tinyml-chain-auto-009-02": 4}, "chain_tiers": {"tinyml-chain-auto-009-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["If 600 KB is free and 300 KB is needed, the allocation should succeed. This must be a bug in TFLite Micro's arena allocator that fails on large allocations.", "Heap fragmentation from frequent model switching. Over 2,880 switches (48 hours), small allocations (BLE buffers, DMA descriptors, sensor logs) scatter across the heap. After 48 hours: 600 KB total free but no single contiguous 300 KB block. Fix: reserve a static 300 KB arena at boot (never freed), or use a pool allocator for small objects to prevent heap fragmentation.", "The RTOS is consuming additional SRAM through stack growth in background threads, reducing the actual free memory from 600 KB to below 300 KB over 48 hours.", "A memory leak in the model switching code allocates 200 bytes per switch without freeing, consuming 200 x 2,880 = 576 KB over 48 hours, leaving only 24 KB free."], "correct_index": 1}}, {"id": "tinyml-0730", "title": "The SPI DMA Cache Coherency Failure", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does a Cortex-M7 print stale zero values after the DMA successfully writes correct SPI camera pixels into RAM?", "chain_ids": ["tinyml-chain-auto-009-03"], "chain_positions": {"tinyml-chain-auto-009-03": 3}, "chain_tiers": {"tinyml-chain-auto-009-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0731", "title": "Flash Endurance Under Continuous Inference Logging", "topic": "vram-budgeting", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does the ML model's output dimensionality dictate the flash write rate, and how does the flash endurance budget determine the maximum model output complexity you can log?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0732", "title": "The Unaligned Struct Padding", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": 
"Why does struct { char sensor_id; int32_t prediction; char status; } write 12 bytes instead of 6?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0733", "title": "The Zero-Point Question", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is naive rounding incorrect for INT8 quantization, and what is the role of a zero-point?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0734", "title": "The Quantization Cliff", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does pushing a keyword model from INT8 to INT4 collapse accuracy from 91% to 74%, and how do you recover?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0735", "title": "The Fixed-Point Accumulator Overflow", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose the overflow condition in an INT32 accumulator for an INT8 Conv2D layer, and what is the maximum safe number of accumulations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["INT32 can hold +/- 2 billion. With only 4,608 accumulations (3x3x512), overflow is impossible in every part of the quantized layer regardless of bias or zero-point terms.", "Worst-case signed INT8 product is (-128) * (-128) = 16,384. Conv2D 3x3x512 has 4,608 accumulations, so the pure MAC sum is 75.5M versus INT32 max 2.15B. Maximum safe same-sign worst-case MAC count is floor((2^31 - 1) / 16,384) = 131,071. This layer's MACs are safe; check bias addition, zero-point compensation, or requantization if overflow is observed.", "The INT16 intermediate product overflows before reaching the INT32 accumulator. INT8 x INT8 = max 16,384, which exceeds INT16 max (32,767 signed), so every multiply produces garbage.", "The overflow is caused by the bias addition after accumulation, not the MAC operations themselves. 
The INT32 accumulator handles the MACs fine, but adding a large bias pushes the result past INT32 limits."], "correct_index": 1}}, {"id": "tinyml-0736", "title": "Quantization Error for INT4 on Cortex-M4", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate the accuracy impact of INT4 quantization and the actual inference speedup (or slowdown) on M4F hardware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["INT4 halves model size and doubles inference speed because you process twice as many values per 32-bit register, achieving 4 MACs/cycle on the M4F.", "INT4 on M4F is a lose-lose: no native INT4 instructions means unpacking to INT8/16 before arithmetic, making it ~2x slower than INT8 (35.8ms vs 17.9ms). Accuracy drops 3-8% (PTQ) or 3% (QAT). Better alternative: prune INT8 to 50% sparsity — same 75 KB size, ~90% accuracy, 9ms inference (4x faster than INT4). INT4 only wins on hardware with native sub-byte SIMD (M55 Helium, NPUs).", "INT4 quantization is always beneficial because it reduces both model size and activation memory by 2x, enabling a second model to fit alongside the keyword spotter.", "INT4 achieves the same accuracy as INT8 when using quantization-aware training, and the 2x size reduction is worth the marginal 10% inference slowdown from unpacking overhead."], "correct_index": 1}}, {"id": "tinyml-0737", "title": "The CMSIS-DSP FFT Scaling Bug", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can a 256-point fixed-point CMSIS-DSP FFT produce features scaled down by 1/256 relative to the training pipeline, and what scaling behavior did you forget to reverse?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0738", "title": "The Per-Channel Trade-off", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why might an MCU still choose per-tensor quantization after per-channel recovers only 3% accuracy?", "chain_ids": ["tinyml-chain-auto-secondary-004-17"], "chain_positions": {"tinyml-chain-auto-secondary-004-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-17": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Always use per-channel quantization — it's strictly better because it recovers 3% accuracy with no downsides on any hardware.", "Per-channel often improves accuracy, but on an MCU it can add scale metadata, memory traffic, and per-channel requantization work unless the backend has a fast supported path. With an 85ms per-tensor baseline and a 90ms deadline, even a modest overhead can erase the timing margin.
Choose per-tensor when latency or SRAM is tight; choose per-channel when the measured backend cost fits the budget.", "Per-tensor is always better on MCUs because the single scale/zero-point fits in a register, so optimized kernels never need extra loads or requantization work.", "The 3% accuracy recovery from per-channel is a benchmark artifact — in real-world deployment on MCUs, per-channel and per-tensor achieve identical accuracy because the quantization noise is dominated by other factors."], "correct_index": 1}}, {"id": "tinyml-0739", "title": "BNN Speedup on Cortex-M4 with XNOR Popcount", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What actual speedup should a binary neural network achieve on Cortex-M4, and why is it not a full 32x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["BNNs execute 32 operations per 1-cycle XNOR, guaranteeing a 32x latency reduction.", "The actual speedup is ~1.6x due to the lack of hardware popcount on M4, requiring 10+ cycles per 32 operations, taking 0.93ms vs INT8's 1.49ms.", "BNNs achieve 32x speedup but fail to converge during training, blocking deployment.", "BNNs take ~60ms because the Cortex-M4 defaults to floating-point emulation for binary kernels."], "correct_index": 1}}, {"id": "tinyml-0740", "title": "The Integer Arithmetic Engine", "topic": "mcu-compute-constraints", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How does an integer-only quantized Conv2D execute on a Cortex-M4 from inputs through requantized outputs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Simply multiply INT8 inputs by INT8 weights, truncate to INT8, and output. The zero-points and scales are only needed during training, not inference.", "Full integer pipeline: accumulate INT8xINT8 products into INT32, subtract cross-terms involving zero-points, add INT32 bias. Then requantize via integer multiply and bit-shift, apply zero-point, and clamp to [-128, 127]. 
No floats anywhere.", "The Cortex-M4 must use software floating-point emulation for the scale factors during requantization, adding ~10 cycles per output element.", "The pipeline uses lookup tables (LUTs) for all multiplications: precompute all 256x256 possible INT8 products in a 64 KB table and replace multiplication with table lookups."], "correct_index": 1}}, {"id": "tinyml-0741", "title": "Shared TFLite Micro Arena Corruption", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does sharing a 10,240-byte TFLite Micro arena between STM32 inference and USB logging cause a hard fault?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0742", "title": "Oversized TFLite Micro Tensor Arena Failure", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the most likely root cause of garbage outputs when a 100,000-sample TFLite Micro input ignores allocation and operator status checks?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The 100,000-sample input exceeds the Cortex-M's maximum DMA transfer size.", "CMSIS-NN uses int16_t for dimensions. 100,000 wraps to -31,072. The loop exits immediately, leaving uninitialized memory.", "The huge input and intermediate tensors exceed the TFLite Micro arena or supported operator shape; if status checks are ignored, firmware may read an invalid or unproduced output buffer as garbage.", "The STM32's memory protection unit (MPU) triggers a bus fault."], "correct_index": 2}}, {"id": "tinyml-0743", "title": "CMSIS-NN Cortex-M4 Speedup for 3x3 Conv2D", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does CMSIS-NN beat plain C by about 8x on a 3x3, 32-to-64 channel Cortex-M4 Conv2D?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0744", "title": "Edge Impulse vs TFLite Micro Deployment", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "When should the team choose Edge Impulse versus direct TFLite Micro for the nRF52840 keyword spotting model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0745", "title": "The CMSIS-NN Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does CMSIS-NN cut Cortex-M4 INT8 matrix multiply latency from 45 ms to 6 ms without changing the clock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["CMSIS-NN uses NEON SIMD instructions on the Cortex-M4, processing 4 INT8 values per cycle 
compared to the naive C code's scalar processing.", "CMSIS-NN exploits the M4's DSP extension with instructions such as SMLAD, which perform dual 16-bit MACs. INT8 kernels pack values and handle widening/sign extension so those DSP instructions, plus loop unrolling and SRAM-friendly access patterns, reach about 0.6 cycles/MAC effective throughput versus naive C's roughly 4.5 cycles/MAC.", "The speedup comes entirely from compiler auto-vectorization that the CMSIS-NN headers enable via special pragma directives, not from any ISA-level instructions.", "CMSIS-NN achieves 7.5x speedup by moving the matrix multiply computation to the DMA controller, which performs the arithmetic while the CPU handles other tasks."], "correct_index": 1}}, {"id": "tinyml-0746", "title": "The CMSIS-NN Transpose Overhead", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a PyTorch NCHW CNN spend 30% of Cortex-M4 inference time in TFLite Micro Transpose ops?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["TFLite Micro is inserting unnecessary Transpose ops due to a framework bug. Filing a bug report and rebuilding TFLite Micro from the latest source will eliminate the overhead.", "PyTorch uses NCHW layout; TFLite Micro/CMSIS-NN requires NHWC. The converter inserts Transpose ops around every layer to convert. Transposing 32x32x64 INT8 via strided reads: ~500K cycles per op (no data cache on M4). With Transpose ops on both sides of each of 10 layers (~20 ops): ~10M wasted cycles (60ms at 168 MHz). Fix: export from PyTorch in NHWC (channels_last memory format) or convert layouts in ONNX before TFLite export, eliminating all runtime transpose ops.", "The transpose operations are caused by the model having batch normalization layers that require NCHW format, even though the convolutions use NHWC.", "The overhead is unavoidable because CMSIS-NN internally uses NCHW for all computations, requiring a layout conversion before and after every kernel call regardless of the input format."], "correct_index": 1}}, {"id": "tinyml-0747", "title": "The Int8 Asymmetric Zero-Point", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is (input - zp_in) * (weight - zp_wt) slower than a symmetric INT8 dot product in TFLite Micro?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["The zero-point subtraction is optional and can be disabled by setting the quantization mode to 'symmetric' in TFLite Micro's runtime configuration.", "Asymmetric quantization requires subtracting zero-points before every MAC: (input - zp_in) * (weight - zp_wt) = 3 instructions/MAC vs symmetric's 2 (input * weight when zp_wt = 0). For 1M MACs: 50% overhead (3M vs 2M instructions). Fix: use symmetric quantization for weights (force zp = 0 during QAT). 
CMSIS-NN optimizes symmetric weights via SMLAD (2 MACs/cycle); asymmetric falls back to scalar code.", "The extra subtraction instructions cause pipeline stalls on the Cortex-M4 because the SUB instruction has a 3-cycle latency that cannot be hidden by the in-order pipeline.", "The zero-point subtraction doubles memory bandwidth because each weight must be loaded twice — once for the subtraction and once for the multiplication."], "correct_index": 1}}, {"id": "tinyml-0750", "title": "The Float-to-Double Silent Promotion", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does float result = sample * 3.14159; take ~60 cycles on a Cortex-M4F?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0751", "title": "The TFLite Micro Resolving Pointer", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does looking up a tensor address via GetTensor() take 3ms on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["GetTensor() is slow because it reads the full tensor data from flash memory into SRAM on each call, wasting time copying data that's already accessible via XIP.", "TFLite Micro stores tensors in a FlatBuffer format. Each GetTensor() call parses offset tables, performs bounds checking, and resolves nested pointers. With poor cache locality on scattered vtable structures, each lookup incurs multiple cache misses. For 150 GetTensor calls per inference at ~1000 cycles each: 150K cycles = 1-3ms overhead. Fix: use TFLite Micro's static memory planner (pre-resolves all pointers at init) or cache resolved pointers in a lookup table after first inference.", "The 3ms overhead is caused by TFLite Micro's garbage collector, which runs between every layer to reclaim temporary activation buffers.", "GetTensor() acquires a mutex lock for thread safety on each call. 
The lock/unlock overhead of the RTOS accounts for the 3ms because the RTOS scheduler runs on every mutex release."], "correct_index": 1}}, {"id": "tinyml-0752", "title": "Conv-BN-ReLU Fusion on Cortex-M7 for SRAM Reduction", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How much SRAM and SRAM traffic can fusing Conv2D, BatchNorm, and ReLU save on a Cortex-M7 MobileNet block?", "chain_ids": ["tinyml-chain-auto-secondary-004-32"], "chain_positions": {"tinyml-chain-auto-secondary-004-32": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0754", "title": "TFLite Micro vs TVM vs Custom Compiler", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compare TFLite Micro, TVM, and a custom compiler across five dimensions: code size, inference speed, memory efficiency, portability, and engineering effort?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0755", "title": "TinyML Flash Budget for Compressing a Person Detector", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Which compression options can reduce the 1.2 MB FP32 person detector to fit within the 800 KB Flash budget?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0756", "title": "Continuous Learning on MCU", "topic": "model-adaptation-systems", "competency_area": "parallelism", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "Is it economically and computationally feasible to perform on-device incremental learning on the Cortex-M7 rather than pushing a full model update via satellite?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0758", "title": "TinyML Memory Hierarchy", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "On a typical $2 microcontroller used for TinyML (like an ARM Cortex-M), what are the two main types of memory, and what are they used for?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["HBM for weights and NVMe for activations.", "Flash memory for read-only model weights, and SRAM for intermediate activations.", "L3 Cache for weights and DDR4 for activations.", "Virtual memory backed by a cloud server."], "correct_index": 1}}, {"id": "tinyml-0759", "title": "Integer-Only Inference", "topic": "mcu-compute-constraints", "competency_area": 
"precision", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why do frameworks like TFLM heavily emphasize integer-only (INT8) operations instead of FP32?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Because microcontrollers cannot physically read 32-bit words from memory.", "Integer math is mathematically proven to be more accurate for audio processing.", "Many microcontrollers lack hardware Floating Point Units; emulating floats in software is too slow and power-hungry.", "INT8 prevents the microcontroller from being hacked via buffer overflows."], "correct_index": 2}}, {"id": "tinyml-0761", "title": "The Sensor Buffer Overflow", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "How much SRAM does the input buffer require, and what fraction of total SRAM does it consume?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["1,000 bytes (~0.4% of SRAM)", "3,000 bytes (~1.1% of SRAM)", "6,000 bytes (~2.3% of SRAM)", "30,000 bytes (~11.4% of SRAM)"], "correct_index": 1}}, {"id": "tinyml-0762", "title": "The INT8 Quantization Memory Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "How much Flash memory does INT8 quantization save, and will the quantized model fit in 1 MB of Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 250 KB saved (25%), INT8 model is 750 KB", "B) 500 KB saved (50%), INT8 model is 500 KB", "C) 750 KB saved (75%), INT8 model is 250 KB", "D) 875 KB saved (87.5%), INT8 model is 125 KB"], "correct_index": 2}}, {"id": "tinyml-0763", "title": "Sigmoid LUT vs Math Library on Cortex-M4", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If a model layer has 1,000 activations per inference, how many cycles does each approach use, and what is the speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) LUT is 4× faster (30,000 vs 120,000 cycles)", "B) LUT is 10× faster (12,000 vs 120,000 cycles)", "C) LUT is 40× faster (3,000 vs 120,000 cycles)", "D) LUT is 120× faster (1,000 vs 120,000 cycles)"], "correct_index": 2}}, {"id": "tinyml-0764", "title": "The MFCC Feature Memory Budget", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How would you calculate the total SRAM needed for the feature matrix that feeds the neural network?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 160 bytes (40 features × 4 bytes)", "B) 1,320 bytes (33 frames × 40 features × 1 
byte)", "C) 5,280 bytes (33 frames × 40 features × 4 bytes)", "D) 32,000 bytes (raw audio for 1 second)"], "correct_index": 1}}, {"id": "tinyml-0765", "title": "The Double Buffer DMA Pipeline", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "What throughput and CPU utilization result when DMA takes 1.28 ms and CPU feature extraction takes 0.9 ms per audio frame?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 58.7 kHz throughput, 41% CPU utilization", "B) 100 kHz throughput, 70% CPU utilization", "C) 100 kHz throughput, 100% CPU utilization", "D) 142 kHz throughput, 50% CPU utilization"], "correct_index": 1}}, {"id": "tinyml-0766", "title": "The Depthwise Separable Ops Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "How many multiply-accumulate operations does each approach require, and what is the reduction factor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["Standard: 75.5M MACs, Separable: 37.7M MACs (2x reduction)", "Standard: 75.5M MACs, Separable: 18.9M MACs (4x reduction)", "Standard: 75.5M MACs, Separable: 8.98M MACs (8.4x reduction)", "Standard: 75.5M MACs, Separable: 0.59M MACs (128x reduction)"], "correct_index": 2}}, {"id": "tinyml-0767", "title": "The Pruning Sparsity Threshold", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "Why doesn't the 5x weight reduction translate to a 5x speedup, and what sparsity pattern would help?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) Unstructured gives 5x speedup; structured gives 2x", "B) Both give 5x speedup — the bottleneck is elsewhere", "C) Unstructured barely helps compute; structured gives the real speedup", "D) Neither helps on MCUs; only quantization reduces latency"], "correct_index": 2}}, {"id": "tinyml-0768", "title": "The Watchdog Timer Recovery", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "If the watchdog timeout is set to 500ms, what is the maximum downtime per hang event, and what is the risk of an aggressive 150ms timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 500ms max downtime; 150ms timeout is safe with 30ms margin", "B) 700ms max downtime; 150ms timeout risks false resets under ISR preemption", "C) 120ms max downtime; the watchdog resets instantly", "D) No downtime — the watchdog prevents hangs from occurring"], "correct_index": 1}}, {"id": "tinyml-0770", "title": "The CMSIS-NN SIMD Speedup", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": 
"published", "phase": "inference", "question": "How would you calculate the theoretical speedup from SIMD, and explain why the real speedup is closer to 2.5x instead of 4x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0776", "title": "The Knowledge Distillation MCU Fit", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What accuracy can you realistically expect from knowledge distillation at this compression ratio, and what are the memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0778", "title": "The Flash Write Endurance Limit", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "understand", "status": "published", "phase": "training", "question": "How many days until the sector wears out, and how would wear leveling across 8 sectors extend the lifetime?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": "", "options": ["A) 370 days (1 sector), 2,960 days (8 sectors)", "B) 37 days (1 sector), 296 days (8 sectors)", "C) 3.7 days (1 sector), 29.6 days (8 sectors)", "D) 10,000 days (1 sector), never wears out"], "correct_index": 1}}, {"id": "tinyml-0779", "title": "The Asymmetric Quantization Outlier Drop", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What calibration issue explains the recall drop for extreme sensor values after 8-bit quantization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0780", "title": "The CMSIS-NN Operator Fallback Latency", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the INT8 CNN become slower than FP32 despite the expected CMSIS-NN SIMD speedups?", "chain_ids": ["tinyml-chain-auto-009-05"], "chain_positions": {"tinyml-chain-auto-009-05": 3}, "chain_tiers": {"tinyml-chain-auto-009-05": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0781", "title": "The QAT Batch Normalization Folding Failure", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What QAT conversion error most likely causes the deployed INT8 Cortex-M4 model to output garbage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0782", "title": "The INT4 Dynamic Unpacking Overhead", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does 
INT4 quantization make inference slower and more power-hungry than INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0783", "title": "The 32-bit Accumulator Overflow", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do INT16 activations with INT8 weights overflow intermediate buffers during large dot products?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0784", "title": "The Single Batch Memory Bandwidth Wall", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Jetson Orin model lose utilization and miss latency targets when moving from batch size 32 to batch size 1?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0785", "title": "The Depthwise Separable Arithmetic Intensity Drop", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does replacing standard convolutions with depthwise separable convolutions barely improve Jetson Orin latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0786", "title": "The Flash Memory Wait State Bottleneck", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does overclocking the Cortex-M4 from 120 MHz to 168 MHz fail to reduce TinyML inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0787", "title": "The Unstructured Sparsity Memory Bloat", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning produce no speedup and higher memory use on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0788", "title": "PCIe Transaction Overhead in Edge-to-Cloud GPU Offloading", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is the local A100 underutilized when processing many tiny TinyML sensor payloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0790", "title": "SRAM Bank Conflicts During Concurrent DMA and CPU Access", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", 
"zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does DMA audio capture slow CPU inference by 50%, and how should SRAM banks be arranged?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0791", "title": "Shared L2 Thrash Between Concurrent Orin Models", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do two models that run in 10 ms alone spike to 45 ms latency when run concurrently on Jetson Orin?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0792", "title": "Power Consumption Overhead of Execute-In-Place from Flash", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does executing dense model weights directly from Flash increase power and heat on the microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0793", "title": "Heap Fragmentation from Variable-Length Tensor Allocation", "topic": "mcu-compute-constraints", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does variable-length dynamic allocation eventually crash the Cortex-M4 despite enough total free SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0794", "title": "Cortex-M4 Brownout Resets During Convolution Current Bursts", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can the heaviest convolution layer cause random Cortex-M4 resets even when the software logic is correct?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0795", "title": "Thermal Throttling Under Continuous Edge ML Workloads", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Jetson Orin inference latency severely degrade after several minutes of continuous operation and only recover after an extended cool-down?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0796", "title": "SRAM Retention Leakage in Deep Sleep Mode", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What retained SRAM leakage explains the 20 microamp deep-sleep drain, and how can bank power-down meet the target?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0798", "title": "Clock Drift and Feature Misalignment in Time-Series Models", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What sampling-rate drift in the new accelerometer batch makes normal movements look anomalous?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0799", "title": "RTOS Deadline Misses from Blocking CPU Polling I/O", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do camera frame captures cause 1 ms control-loop deadlines to be missed on the M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0800", "title": "False Positives from Seasonal Ambient Temperature Drift", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What long-term sensor drift causes the industrial temperature model to output continuous false positives?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0801", "title": "CPU Preprocessing Bottleneck in Edge Vision Pipelines", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What preprocessing bottleneck keeps the video pipeline below 15 FPS despite 5 ms NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0802", "title": "Flash Memory Exhaustion from Dual-Bank A/B OTA Updates", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does safe OTA deployment of a 200 KB model fail on a Cortex-M4 with 256 KB Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0805", "title": "TFLite Micro Dynamic Interpreter Execution Overhead", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What source of TFLite Micro non-kernel overhead causes 30% of inference time to occur outside math kernels?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0806", "title": "Model Weight Corruption from Cross-Architecture Endianness Mismatch", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "What endianness mismatch can make weights trained on x86 produce nonsense on a legacy DSP microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0808", "title": "Hardware Resets from Watchdog Timer Starvation During Inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the M4 reset during a 500 ms blocking TinyML inference, and how should inference be scheduled to avoid it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0809", "title": "Audio Buffer Overruns from Unsynchronized DMA Fill Rates", "topic": "data-pipeline-engineering", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why are audio predictions fluctuating when inference takes 40 ms but DMA fills the buffer every 32 ms, and what should be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0810", "title": "PID Loop Destabilization from RTOS Task Preemption Jitter", "topic": "transformer-systems-cost", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the drone jerk when the TinyML gesture task runs, and how should task priorities protect the PID loop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0811", "title": "Flash Capacity Exceeded by Aggressive Loop Unrolling Bloat", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does switching from -Os to -O3 make the Cortex-M4 binary fail to flash, and what optimization strategy should be used?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0812", "title": "UsageFaults from Unaligned 32-bit Memory Accesses", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does reading the 32-bit metadata field in the packed C struct trigger a HardFault on the Cortex-M4, and how can it be resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0814", "title": "TVM vs TFLite Micro SRAM OOM from Residual Buffer Planning", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the TVM-compiled ResNet OOM while TFLite Micro fits, and what memory planning issue should be corrected?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0816", "title": "Wake-Word Duty Cycle Evaluation Under Power Constraints", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the maximum allowable sampling frequency to meet the power budget, and does the architecture satisfy the latency SLO?", "visual": {"kind": "svg", "path": "tinyml-0816.svg", "alt": "Log-scale timeline diagram showing active power spikes.", "caption": "Power profile of the wake-word sensor duty cycle."}, "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 4}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0818", "title": "Acoustic Monitor Power Budgeting", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Evaluate the exact CPU cycle requirements and design a system frequency configuration to ensure the device remains under a 15mW active compute power budget?", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0819", "title": "BLE Transmission Overlap Diagnosis", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the latency failure in the current sequential pipeline and design a specification to overlap operations using hardware peripherals?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0828", "title": "Intermittent Computing FRAM Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the overhead of this checkpointing mechanism if the model has 10 layers, each producing a 10KB output tensor, and writing to FRAM takes 2 microseconds per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0841", "title": "Implement depthwise separable conv for Cortex-M4 with CMSIS-NN", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the three-layer depthwise separable CNN be implemented with CMSIS-NN and buffered to fit STM32F4 SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0842", "title": "Implement MobileNet-style INT8 quantization for ESP32-S3 inference", "topic": "cnn-efficient-design", "competency_area": 
"architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should the MobileNetV1-0.25x model be quantized and deployed on ESP32-S3 for reliable INT8 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0843", "title": "Cortex-M4 Depthwise Separable Convolution Tiling for SIMD Latency and SRAM", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the Cortex-M4 depthwise separable layer be tiled and costed to achieve correct SIMD latency and SRAM use?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 4}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0844", "title": "Realization: deploy EfficientNet-inspired tiny CNN on Cortex-M4 within 256KB SRAM", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you design the architecture to fit within 256KB SRAM and 256KB Flash for model weights, then realize the TFLite Micro deployment?", "chain_ids": ["tinyml-chain-auto-016-06"], "chain_positions": {"tinyml-chain-auto-016-06": 3}, "chain_tiers": {"tinyml-chain-auto-016-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0845", "title": "Realization: adapt MobileNetV1-0.25x for ESP32-S3 with resolution scaling", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How must the MobileNetV1-0.25x input resolution and architectural stride pattern be adapted to satisfy the ESP32-S3 latency and SRAM constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0846", "title": "Specification: define CNN architecture constraints for Cortex-M4 vision inference", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What architecture, memory, latency, and accuracy constraints should specify a CNN for 64x64 fault detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0847", "title": "Dataset Curation: Design Training Data Pipeline for MCU Keyword Spotting", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the training data collection and curation pipeline that produces balanced training data suitable for this ultra-constrained deployment context?", "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0848", "title": "IMU Gesture Dataset Storage Budget for ESP32-S3 PSRAM", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you estimate storage requirements for 10K training samples and determine whether they fit in the 8MB PSRAM for on-device training?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0849", "title": "Dataset Curation: Implement Feature Extraction Pipeline for Cortex-M4 Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How should MFCC feature extraction simulate Cortex-M4 fixed-point behavior to maintain training and inference parity?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 0}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0850", "title": "Dataset Curation: Implement Data Augmentation for TinyML Acoustic Models", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How large is the stored augmented KWS dataset after applying speed and SNR noise, how does SpecAugment affect storage, and is the pipeline feasible to process?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0851", "title": "Dataset Curation: Mastery — End-to-End Data Strategy for TinyML Anomaly Detection", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design the complete data strategy for an anomaly detection model that achieves 5% false alarm rate and 90% anomaly detection rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0852", "title": "Dataset Curation: Mastery — Cross-Device Dataset Portability for TinyML Fleet", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design a single data curation pipeline that produces training data suitable for all three MCU model variants?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0853", "title": "Dataset Curation: Optimize Dataset Size vs Model Accuracy on Cortex-M4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What is the most cost-effective path from 89% to 95% keyword-spotting accuracy on Cortex-M4?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": 
{"tinyml-chain-auto-027-20": 1}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0854", "title": "Dataset Curation: Realize Dataset Storage Architecture for TinyML Training Pipeline", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify file formats, feature pre-computation, quantization pipeline, and validation set splits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0855", "title": "Dataset Curation: Realize PSRAM-Constrained Training Dataset for ESP32-S3", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you specify what fits in PSRAM, the training batch strategy, and training loop design?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 1}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0856", "title": "TinyML Training Data Constraints for Cortex-M4 Deployment", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What are the three key data constraints that differ from cloud ML training, and why does each matter for model accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0857", "title": "Dataset Curation: Specification — TinyML Dataset Quality SLA for Medical Wearable", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "training", "question": "What quantitative dataset quality SLAs are needed for a medical fall detector to meet sensitivity and false alarm targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0858", "title": "Latency Decomposition: Compare Cortex-M4 vs. Cortex-M7+Ethos-U55 for Keyword Spotting", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency and explain which dominates the power budget for always-on detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0859", "title": "Latency Decomposition: Compare ESP32-S3 vs. 
Cortex-M4 for Image Classification", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare inference latency accounting for the PSRAM access penalty on the ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0860", "title": "Latency Decomposition: Compute Per-Layer Latency Budget for MCU CNN", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Which layer is the raw compute bottleneck for a 2.1 MOP depthwise-separable CNN on a 168 MHz Cortex-M4 at 1 MAC/cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0861", "title": "Latency Decomposition: Full TinyML Pipeline Audit for Predictive Maintenance Sensor", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the predictive maintenance latency pipeline be audited, and which components leave room under the 50 ms budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0862", "title": "Latency Decomposition: Full TinyML Gesture Recognition Pipeline with Power Budget", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the ESP32-S3 gesture wristband meet its one-week battery target, and what power changes are needed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0863", "title": "Latency Decomposition: Optimize MCU Inference Latency from 200ms to Under 50ms", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which optimizations can move the 168 MHz Cortex-M4 speech model from 200 ms toward the 50 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0864", "title": "Latency Decomposition: Size and Validate TinyML Model for Real-Time ECG", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Do the ECG model's 50 KB weights, activations, and buffers fit in 256 KB SRAM on a 168 MHz MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0865", "title": "Latency Decomposition: Size End-to-End Latency for Environmental Sensor TinyML Node", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": 
"apply", "status": "published", "phase": "inference", "question": "How should end-to-end latency be sized for the ESP32-S3 air quality pipeline, including sensor reads and LoRa transmission?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0866", "title": "Model Format Conversion: Compare TFLite Micro vs. TensorFlow Lite for MCU Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare TFLite Micro (MCU runtime, <20KB RAM overhead) vs. standard TFLite (GPU delegate) for each platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0867", "title": "TFLM Ethos-U55 Delegate vs Manual CMSIS-NN Offload", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 500K-parameter INT8 CNN use TFLM with Ethos-U55 delegate or manual CMSIS-NN and Ethos offload?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0868", "title": "Model Format Conversion: Recall TFLite Micro Supported Op List and Limitations", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which common neural network operations are NOT supported in TFLite Micro, and what are the workarounds?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 0}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0869", "title": "Model Format Conversion: Implement TFLite Flatbuffer Size Calculation for MCU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total model size and determine if it fits in the 512KB Flash alongside a 64KB application binary?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0870", "title": "Ethos-U55 TFLM and Vela Deployment for MobileNetV1 Person Detection", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Does the full conversion pipeline's memory layout meet a 50ms latency budget on a battery-powered device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0871", "title": "ESP32-S3 Deployment Tradeoff: TFLM LSTM vs 1D-CNN", "topic": 
"model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a 50K-parameter LSTM for ESP32-S3 stay in TFLM, use a custom engine, or be replaced by a 1D CNN?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0872", "title": "TFLite Micro Flash Footprint Optimization with Kernel Trimming and Pruning", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose and quantify optimizations to free 100KB Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0873", "title": "Model Format Conversion: Size TFLM Model for Cortex-M4 Flash and SRAM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you size Flash and SRAM for MobileNetV1-0.25 INT8 on a 512 KB Flash, 256 KB SRAM Cortex-M4?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 1}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0874", "title": "Model Format Conversion: Size TFLM Deployment for ESP32-S3 with PSRAM", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency penalty comes from placing 475 KB MobileNetV1 weights in ESP32-S3 PSRAM instead of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0875", "title": "Model Format Conversion: Optimize INT4 Quantization for Cortex-M4 MCU Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate INT4 quantization for the first model and quantify Flash savings vs. 
accuracy risk?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0876", "title": "Model Format Conversion: Optimize TFLM Model to Fit Within MCU Flash Constraint", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the smallest set of Flash-saving optimizations that makes the image fit in 256KB while keeping expected accuracy loss under 2%?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0877", "title": "Model Format Conversion: Size Multi-Model TFLM Deployment on Cortex-M7+NPU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should Flash, SRAM, and activation arenas be sized for two triggered TFLM models on Cortex-M7 plus Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0878", "title": "TFLite Micro INT8 Quantization Parameters for Activations and Conv Weights", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What scale and zero-point parameters are stored for INT8 tensors, and when are per-axis weight scales used?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0879", "title": "Cortex-M4 INT8 Model Size: Flash Storage vs SRAM Tensor Arena Limit", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is 512 KB Flash not the real limit for INT8 model size on Cortex-M4, and what SRAM-based limit applies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0880", "title": "Model Size Estimation: Design Minimum-Memory Architecture for MCU Keyword Spotter", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive layer types, filter counts, activation memory profile per layer, and verify the SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0881", "title": "Model Size Estimation: Compare DS-CNN-S vs. 
MobileNetV1-0.25 on ESP32-S3", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which fits ESP32-S3 audio KWS better: DS-CNN-S at 0.9 MOPs or MobileNetV1-0.25 at 14.9 MOPs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0882", "title": "Model Size Estimation: Compare Quantized vs. Float Model on Cortex-M7+NPU", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you compare Flash usage, SRAM peak activation memory, inference latency, and power for a 10Hz detection rate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0883", "title": "Cortex-M4 Memory Fit Check for a 50K Parameter INT8 Model", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does a 50K-parameter INT8 model fit in 512 KB Flash and 256 KB SRAM on a Cortex-M4?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0884", "title": "Model Size Estimation: Master Full MCU Memory Budget for Industrial Sensor Node", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you verify complete system fits and compute inference duty cycle for 10Hz operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0885", "title": "Model Size Estimation: Master Memory-Constrained Model Selection for Medical Wearable", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model and platform choice can meet the medical arrhythmia accuracy and 5-year battery requirements, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0886", "title": "Cortex-M4 TFLM HardFault from Tensor Arena SRAM Overflow", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you diagnose a Cortex-M4 TFLM HardFault when a 180 KB arena fails at the third convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0887", "title": "Model Size Estimation: Diagnose Flash Overflow on ESP32-S3 Multi-Model Deployment", "topic": 
"model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 two-model firmware report a 2.1 MB overflow when the board has 16 MB of physical Flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0888", "title": "Model Size Estimation: Realize Parameter Count and Memory for MCU Anomaly Detector", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many bias bytes are required for the Conv1D anomaly detector, and why is that count different from the incorrect estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0889", "title": "Model Size Estimation: Realize Full System Memory for ESP32-S3 Voice Assistant", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should SRAM, PSRAM, and Flash be allocated for the ESP32-S3 voice assistant with two models and audio buffering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0890", "title": "Derive Power, Duty Cycle, and Latency Budget for a CR2032-Powered Cortex-M4 Sensor", "topic": "model-size-estimation", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you derive maximum model parameters, inference latency, duty cycle, and average power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0891", "title": "TFLM Memory Planning: Compute Activation Arena Size for a 3-Layer INT8 CNN", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What minimum activation/scratch arena is needed, excluding model weights and persistent tensors?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0892", "title": "Specify a Safety-Critical TFLM Deployment Checklist for a Medical MCU", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a production deployment checklist for a TFLite Micro model on this Cortex-M7 plus microNPU medical-device platform?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0893", "title": "TinyML TCO Recall: Cortex-M4 vs ESP32-S3 Cost Profile", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", 
"question": "For a 10,000-unit deployment, which platform has lower 2-year TCO?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0896", "title": "TinyML TCO Design: Optimized Power Budget for Wearable TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can a 100 mAh LiPo meet a 7-day wearable budget with a continuous 2 mA heart-rate sensor, ECG classification every 5 minutes, and 30-minute BLE syncs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0897", "title": "TinyML TCO Design: Cortex-M7+Ethos-U55 for Industrial TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Using the stated hardware, current, voltage, and electricity-rate inputs, how would you calculate the 5-year Total Cost of Ownership (TCO) and determine the most cost-effective architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0898", "title": "5-Year TCO for 10,000 ESP32-S3 Crop Monitoring Sensors", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the 5-year TCO for 10,000 ESP32-S3 (240MHz, 512KB SRAM, WiFi) crop monitoring sensors?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0901", "title": "TinyML TCO Diagnosis: Battery Life Mismatch Root Cause", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause and calculate the actual vs expected average current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0902", "title": "TinyML TCO Evaluation: Cortex-M4 vs ESP32-S3 for Production Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the 2-year battery replacement TCO for 500 units?", "chain_ids": ["tinyml-chain-auto-secondary-008-10"], "chain_positions": {"tinyml-chain-auto-secondary-008-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-0904", "title": "TinyML TCO Evaluation: Cloud-in-the-Loop vs Fully On-Device TinyML", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "When should the predictive maintenance fleet use on-device TinyML versus cloud inference, and what is the three-year TCO?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0905", "title": "TinyML TCO Fluency: Quick Cost Estimation for TinyML Deployment", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the quick estimates for sensor energy cost, ESP32-S3 cloud break-even volume, and coin-cell battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0906", "title": "Compute CR123A Battery Life for a TinyML Smoke Detector", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected battery life of this system at 3V, and does it meet a 10-year requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0908", "title": "TinyML Inference Energy and Cost Comparison", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the energy per inference (in μJ) and the cost for 1 million inferences on both platforms, and which factor is the true differentiator?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0910", "title": "ESP32-S3 vs Cloud NB-IoT TCO for TinyML Inference", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How do hardware, power, cloud inference, data transmission, and SIM costs compare for ESP32-S3 versus cloud inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0911", "title": "TinyML TCO Mastery: Full Lifecycle TinyML Product Cost Model", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What are the five-year lifecycle costs and margin for the ESP32-S3 air quality monitor product?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0912", "title": "TinyML TCO for Industrial Sensor Make Buy Cloud 
Decision", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Which make, buy, or cloud option minimizes five-year TCO for 50,000 vibration sensors, and why?", "chain_ids": ["tinyml-chain-auto-secondary-008-09"], "chain_positions": {"tinyml-chain-auto-secondary-008-09": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-008-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0914", "title": "TinyML TCO Optimization: Quantization Savings for TinyML Fleet", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the battery life improvement, fleet-wide annual power savings, and is the $20K quantization engineering cost justified over 3 years?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0915", "title": "TinyML TCO Optimization: Reduce TinyML Fleet Management Costs", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What automation opportunities exist for each category, and what are the quantified savings and 3-year NPV at a 5% discount rate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0919", "title": "TinyML ESP32-S3 Development Cost Breakdown", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total first-year platform development cost for the ESP32-S3 TinyML product, including tools, cloud, labels, and engineering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0920", "title": "TinyML Smart-Building Deployment Budget Under $100K", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you verify against the $100K budget constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0922", "title": "TinyML TCO Specification: Design TinyML Fleet Budget for Healthcare", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What five-year budget should a hospital plan for 200 regulated TinyML vital sign monitors, and what dominates the cost?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0923", "title": "Analyze Transformer FLOPs on Cortex-M7+Ethos-U55", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": 
"published", "phase": "inference", "question": "How much MobileViT computation falls back to the Cortex-M7 versus Ethos-U55, and how does that affect latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0924", "title": "Analyze Chinchilla Scaling for MCU-Deployable Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the tension between Chinchilla-optimal training compute and the memory constraints forcing architectural compromises at inference?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0925", "title": "Design Speculative Decoding for Cortex-M4 Transformer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the speculation window size and fallback strategy to maximize tokens/second?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0926", "title": "Design Attention Approximation for 64KB KV-Cache Budget", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What attention approximation fits a 128-token transformer context into the ESP32-S3 KV-cache budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0927", "title": "Design Layer-Wise Quantization Schedule for Transformer on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a mixed-precision quantization schedule that preserves accuracy while fitting memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0928", "title": "Diagnose Transformer Inference Latency Regression on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did the ESP32-S3 transformer jump from 85 ms to 340 ms when context increased from 32 to 128 tokens?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0929", "title": "Diagnose Attention Head Collapse in Quantized MCU Transformer", "topic": 
"transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose why INT4 caused head collapse and propose a targeted fix?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0930", "title": "Diagnose Memory Fragmentation During Transformer Layer Execution", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 transformer with 160 KB free heap fail to allocate a 128 KB QK^T buffer, and how can it be fixed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0931", "title": "Evaluate Transformer vs CNN for Wake-Word Detection on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 500K-parameter transformer worth 1.4% KWS accuracy over a DS-CNN given 15 versus 69 days of battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0932", "title": "Evaluate KV Cache Compression Techniques on Ethos-U55", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which KV-cache compression gives the best accuracy-memory Pareto point for the 4-layer transformer on Cortex-M7+Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0933", "title": "Evaluate Pruning Strategies for Transformer Attention on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 pruning mix gets a 140 ms transformer under 100 ms with the least accuracy loss?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0934", "title": "Flash Attention FLOPs Formula for Tiny Transformers", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What total FLOPs should be used for the tiny transformer forward pass, and what calculation pitfalls must be avoided?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0935", "title": "Recall Arithmetic Intensity Threshold for Attention on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "fluency", 
"bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the arithmetic intensity threshold, and is standard dot-product attention compute-bound or memory-bound at sequence length 64 with 64-dim heads?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0936", "title": "Recall Roofline Threshold for ESP32-S3 Transformer Decode", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What roofline arithmetic-intensity threshold separates memory-bound from compute-bound decode for this ESP32-S3 transformer, and where does the 500K-parameter model fall?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0937", "title": "Implement Tiled Matrix Multiply for Attention on Cortex-M7", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you determine optimal tile size and compute expected speedup over naive implementation?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0938", "title": "Achieve Mastery in Transformer Inference Optimization on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "What optimization roadmap gets the Cortex-M7+Ethos-U55 transformer from 280 ms and 8.2 mW to the targets?", "chain_ids": ["tinyml-chain-auto-secondary-004-19"], "chain_positions": {"tinyml-chain-auto-secondary-004-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0939", "title": "Master Flash-Aware Transformer Scheduling on ESP32-S3", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How feasible is SPI layer-swapping for a 12 MB transformer on ESP32-S3, and what prefetch and compression schedule can meet 1 s latency?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-0940", "title": "Master Transformer FLOPs-to-Power Model for Battery Sizing", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you compute the active energy per inference, sleep energy between inferences, total annual energy, and required battery capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0941", "title": "Optimize Prefill Batching on Ethos-U55 for Batch Documents", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What batch size should the Ethos-U55 use for eight simultaneous 64-token prefill jobs under the 256 KB SRAM budget to maximize throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0942", "title": "Optimize Weight Sharing Across Transformer Layers on Flash-Limited MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the computational cost increase, memory impact, and expected accuracy trade-off?", "chain_ids": ["tinyml-chain-auto-secondary-004-20"], "chain_positions": {"tinyml-chain-auto-secondary-004-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0943", "title": "Recall Transformer Token Budget on TinyML Hardware", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What maximum transformer sequence length fits in a 128 KB KV cache on Cortex-M4 with the given layers, heads, and head size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0944", "title": "Specify Transformer Architecture for Sub-1mW Inference on MCU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a transformer (layers, heads, embedding dim, FFN ratio, quantization) that stays under a 0.5mW active power budget while giving a plausible path to >90% validation accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0945", "title": "Specify KV Cache Memory Map for MCU NPU", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify the complete KV cache address map, including layer, K/V, head, sequence, dimension offsets, and any 16-byte alignment padding?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0946", "title": "Realize Quantization-Aware Training Pipeline for MCU Transformer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should an INT8 QAT pipeline be staged for the Cortex-M7 transformer to recover F1 within the 24 GPU-hour budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0947", "title": "Realize Streaming Transformer Inference with Circular KV Buffer", "topic": "transformer-systems-cost", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How should a circular KV buffer update 20 ms speech chunks on ESP32-S3 while maintaining a 128-token sliding window?", "chain_ids": ["tinyml-chain-auto-secondary-004-21"], "chain_positions": {"tinyml-chain-auto-secondary-004-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0948", "title": "Analyze TCO of Custom ASIC vs MCU for TinyML at Scale", "topic": "tco-cost-modeling", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze 5-year TCO for both options including warranty costs from missed anomalies?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0951", "title": "Cortex-M4 vs NPU Latency Analysis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the theoretical compute-bound latency for both architectures that explains why the slower-clocked NPU outperforms the CPU?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": {"tinyml-chain-auto-secondary-012-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0952", "title": "Adversarial Denial of Sleep Attack Analysis", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the systemic impact of this adversarial perturbation on the device's power budget and operational lifespan?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0953", "title": "Depthwise Convolution Latency Discrepancy", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": 
"analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the latency improvement not match the MAC reduction, and why did the SRAM footprint grow on this architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0954", "title": "NPU Utilization and Cycle Cost Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you calculate the actual cycle cost for both layers and explain why the depthwise layer's compute efficiency drops so drastically on this shared-memory architecture?", "chain_ids": ["tinyml-chain-auto-secondary-004-02"], "chain_positions": {"tinyml-chain-auto-secondary-004-02": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0955", "title": "Active Learning SRAM Overflow", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does curating a dataset of low-contrast anomalies cause these specific hardware failures?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0956", "title": "DMA Overhead on Small Audio Chunks", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does DMA raise power for 32-byte I2S transfers on a 128 MHz nRF5340 instead of saving it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0957", "title": "Battery Drain Anomaly in Dual-Core Always-On Audio", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the dual-core always-on audio system drain the 200mAh coin cell in 4 days despite the application core's low 10% duty cycle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0958", "title": "Energy Impact of PSRAM Weight Offloading", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving 400 KB of weights from SRAM to PSRAM greatly increase the ESP32-S3 energy per inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0959", "title": "4-Bit Quantization Latency Regression on nRF5340", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why did 4-bit weight quantization on the nRF5340 shrink flash usage but 
increase inference latency and battery drain?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0961", "title": "Analyze Fallback Latency", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Explain why this latency overrun happens on the MCU when falling back from a 1M-MAC to a 4M-MAC CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0962", "title": "Operator Fusion Memory Tradeoff on nRF5340", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does fusing operators to reduce latency cause a significant increase in peak SRAM usage?", "chain_ids": ["tinyml-chain-auto-secondary-004-32"], "chain_positions": {"tinyml-chain-auto-secondary-004-32": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-32": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0963", "title": "The Distillation Temperature Bottleneck", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does this specific layer cause such a severe performance degradation, and what is the optimal deployment strategy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0964", "title": "Latency Decomposition of SPI Camera Pipeline on Corstone-300", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why is Corstone-300 vision latency about 40 ms when Ethos-U55 inference is only 0.04 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0965", "title": "NPU to CPU Fallback Latency Analysis", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does replacing ReLU with a zero-MAC custom activation increase total layer execution time by roughly 60x?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0966", "title": "ESP32-S3 PSRAM Bandwidth Bottleneck Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do ESP32-S3 INT8 weights in external PSRAM dominate latency despite 240 MHz vector compute?", "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0967", "title": "ESP32-S3 Quantized Model Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 INT8 TFLM model run slowly when ESP-NN vector kernels are not linked?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 1}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0968", "title": "Analyzing SRAM Overflow in Cortex-M4 Residual Block", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this memory failure occur despite the largest individual activation size being well under the 256 KB SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0969", "title": "NAS Memory Constraint Analysis on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does the NAS reject the nRF5340 inverted bottleneck block despite its weights fitting in Flash?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0970", "title": "Layer Fusion Latency Penalty on Cortex-M4", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this layer fusion scheduling behavior degrade execution speed on this specific hardware architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0971", "title": "OTA Rollback due to Shared SRAM Exhaustion", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Corstone-300 OTA rollback after boot when a 320 KB tensor arena and 200 KB network stack share 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0972", "title": "NPU Offloading Energy Overhead Paradox", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": 
"inference", "question": "Why does this system behavior occur, and what is the total energy-per-inference for both CPU-only and NPU-offloaded execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0973", "title": "ESP32-S3 PSRAM Bottleneck Analysis", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you analyze why the execution takes 10 ms despite the high-speed CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0974", "title": "Unstructured vs Structured Pruning on Ethos-U55", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured sparsity provide no speedup on the Ethos-U55, and what is the compute cycle difference with 50% structured channel pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0975", "title": "Asymmetric Weight Quantization Penalty", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you explain mathematically why changing the weights to asymmetric quantization causes such a drastic latency spike on this specific hardware?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0976", "title": "Analyzing Preemption and Real-Time Misses on STM32F4", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does theoretical execution time differ from the observed 12ms, and what causes the deadline miss?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0977", "title": "Latency Analysis of On-Device Privacy Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the guardrail cause this specific latency bottleneck, and what is its minimum execution time floor due to memory fetches?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0978", "title": "ESP32-S3 PSRAM Roofline Bottleneck", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Using the roofline model, why are the vector extensions underutilized and what is the layer's operational intensity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0979", "title": "Watchdog Resets in 
Dual-Core Shared SRAM Inference", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze why the device experiences intermittent watchdog resets during these network events?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0980", "title": "DMA Buffer Overrun with BLE on ESP32-S3", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this happen based on the system's temporal constraints, and what is the maximum time the BLE stack can block the core before causing an overflow?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0981", "title": "Operator Scheduling for Peak SRAM Reduction", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does changing the execution order of these parallel branches alter the peak static memory requirement in the flat tensor arena?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 2}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0982", "title": "ESP32-S3 vs External NPU for Edge AI", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the system architecture, compare the compute and memory tradeoffs, and justify native ESP32-S3 execution versus an external ASIC/NPU for these two models?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], "chain_positions": {"tinyml-chain-auto-secondary-012-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0983", "title": "Secure TinyML Keyword Spotting Architecture", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design a defense strategy that masks execution patterns without violating a strict 50ms end-to-end inference latency budget?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0984", "title": "Architecting Energy-Constrained Audio Inference", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", 
"question": "Can the MCU run the 1.2 MMAC acoustic model for one-year battery life, and what duty-cycle architecture is needed?", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0985", "title": "Active Learning Data Pipeline for nRF5340 Anomaly Detection", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should nRF5340 vibration sensors curate active-learning data without continuously streaming raw signals?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 2}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0986", "title": "Energy-Aware Model Architecture for Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 vision wake-word model minimize SRAM access energy while using the Ethos-U55 efficiently?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0987", "title": "Architecting Sub-8-bit Weight Quantization for Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect an inference system utilizing extreme weight quantization to fit the model on-device while maintaining real-time execution speeds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0988", "title": "ESP32-S3 Acoustic Degradation Ladder", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a graceful degradation strategy that maintains fail-operational safety monitoring while adapting to constrained resources?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0989", "title": "AOT Compilation Pipeline Design for Cortex-M4 Edge Inference", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What AOT compilation passes are required to turn the Cortex-M4 keyword model into a static CMSIS-NN executable?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-0990", "title": "Designing a Distillation Pipeline for ESP32-S3 Audio Classification", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the tradeoffs between logit matching and feature distillation while maximizing the use of the ESP32-S3's INT8 vector extensions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0991", "title": "Dual-Core Memory Allocation for Keyword Spotting", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the trade-offs of different memory allocation strategies considering the shared SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0992", "title": "Ethos-U55 Fallback Delegation and SRAM Strategy", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the model conversion and runtime delegation strategy to handle this fallback without exceeding the 512 KB SRAM limit or causing severe latency bottlenecks?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 1}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0993", "title": "Architecting a KWS Memory Pipeline for ESP32-S3", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the ESP32-S3's internal SRAM and external PSRAM be partitioned to meet real-time latency constraints with WiFi enabled?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0994", "title": "Hardware-Aware NAS Design for ARM Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How must the NAS constraint estimators evaluate peak memory and latency without physically deploying every candidate architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0995", "title": "Dual-Core Power Partitioning on nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should DSP, inference, and BLE work be partitioned 
across nRF5340 cores to meet a 1 mW audio anomaly budget?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0996", "title": "Profiling CPU-NPU Memory Contention on Corstone-300", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect a non-intrusive trace and profiling strategy to definitively isolate whether the bottleneck is compute-bound on the NPU or memory-bound due to SRAM contention?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0997", "title": "Pruning Pipeline Design for nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "What pruning strategy should compress the CNN while reducing both SRAM and latency, and how does it align with the MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0998", "title": "ESP32-S3 Quantization Strategy Design", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 quantization strategy fits the predictive maintenance model in fast SRAM without sacrificing INT8 acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-0999", "title": "On-Device Bias Guardrails for Edge Audio", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an on-device OOD guardrail for respiratory audio be partitioned between Cortex-M7 and Ethos-U55 within 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1000", "title": "Real-Time Vibration Ingestion Pipeline with Ethos-U55", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the streaming ingestion (DMA buffering, feature-compute overlap, memory partitioning) to minimize dropped-frame risk under the stated SRAM and compute budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1001", "title": "NPU Shared SRAM Bus Contention Side-Channel", "topic": "adversarial-robustness", "competency_area": 
"reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this vulnerability, and how does the attack mechanism extract the model architecture?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1002", "title": "ESP32-S3 PSRAM Bottleneck Diagnosis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the root cause of this massive compute estimation discrepancy and how would you prove it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1003", "title": "Diagnosing Domain Shift in ESP32-S3 Wake-Word Datasets", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of the performance drop and explain how you would curate the dataset to resolve it?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 2}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1004", "title": "NPU Inference Latency and CPU Bottleneck", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does copying a 256x256 camera frame on the 480 MHz M7 add about 0.7 ms before Ethos-U55 inference?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1005", "title": "Diagnosing Power Regressions in Sparse Networks", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can an 80% sparser Cortex-M4 network triple active energy per inference despite utilizing CMSIS-NN SIMD instructions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1006", "title": "4-bit Quantization Fallback Diagnosis on Ethos-U55", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4-bit Ethos-U55 keyword model peg the 480 MHz host CPU while the NPU sits idle?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-1007", "title": "Distilled Student SRAM Exhaustion", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why can a smaller distilled nRF5340 model OOM while a larger pruned model runs successfully?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1008", "title": "Diagnosing Flash Wait State Stalls in CMSIS-NN", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 168 MHz STM32F4 stall when CMSIS-NN streams 800 KB of INT8 weights from flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1009", "title": "Diagnosing Latency Spikes from Unoptimized Fallbacks", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an INT8 depthwise convolution on STM32F4 take 890 ms instead of 30 ms, and how should it be fixed?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 3}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1010", "title": "NAS Latency Regression on Ethos-U55 NPU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does an Ethos-U55 NAS candidate with fewer FLOPs run 3x slower on a 512 KB Corstone-300 system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1011", "title": "Diagnosing SRAM Overflow During Concurrent Bluetooth Operations", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the nRF5340 audio model OOM during residual blocks only while BLE 5.3 is transmitting?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1012", "title": "Diagnosing Battery Drain in ESP32-S3 Wake-Word Engine", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware or software interactions are likely causing this massive power budget violation, and how would you diagnose the root cause?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 2}, "chain_tiers": 
{"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1013", "title": "Diagnosing Asymmetric Quantization Overhead", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do asymmetric INT8 weights make the nRF5340 wake-word model fall from 20 ms to about 600 ms?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1014", "title": "Floating-Point Emulation Overhead in Safety Guardrails", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the Cortex-M4 OOD guardrail push latency past 60 ms, and how should the distance computation be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1015", "title": "HardFault During High-Frequency Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Cortex-M4 with 256 KB SRAM HardFault after buffering 1 s of 50 kHz vibration data and an 80 KB model arena?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1016", "title": "Diagnosing Peak SRAM OOM on Nordic nRF5340", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Nordic nRF5340 HardFault on one convolution despite 400 KB weights fitting in flash and tensors summing under 256 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1017", "title": "Evaluating Adversarial Defenses on ESP32-S3 Smart Locks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate these alternatives and determine the best approach given the hardware constraints?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1018", "title": "Evaluating Model Architectures for Cortex-M4", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": 
"Which STM32F4 model is viable at 10 Hz with 50% CPU reserved: 15M FP32 MACs or 12M INT8 MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1019", "title": "Active Learning Curation for INT8 Constrained Edge Devices", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which active learning curation pipeline do you choose and how do you implement the selection efficiently?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1020", "title": "Audio Pipeline DMA Tradeoffs on ESP32-S3", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory pipeline design maximizes inference bandwidth and CPU sleep time on the ESP32-S3?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1021", "title": "Energy-Aware Model Selection on nRF5340", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model (SRAM-bound Model A vs Flash-bound Model B) minimizes total energy per inference on the nRF5340, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1022", "title": "NPU Thermal Fallback Strategy", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which fallback design should run when the Ethos-U55 is thermally power-gated, and does it meet the 5 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1023", "title": "Evaluating Distillation Strategies for Ethos-U55", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which distillation proposal best fits a 512 KB Cortex-M7+Ethos-U55 KWS deployment and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1024", "title": "Evaluating Pipeline Latency on Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which STM32F4 pipeline is lower latency: a soft-float 1024-point FFT with 50k MACs or integer preprocessing with 200k MACs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1025", "title": "TFLite Micro vs AOT Compilation on nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": 
"tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do TFLite Micro and AOT compilation compare regarding SRAM footprint, operator support, and runtime overhead on the nRF5340?", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 1}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1026", "title": "Evaluating Memory Constraints on Shared SRAM Architectures", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Corstone-300 model is feasible: 120K params with 260 KB activations or 190K params with 110 KB activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1027", "title": "Hardware-Aware NAS Tradeoffs on ESP32-S3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which architecture do you choose for deployment, and how do you justify the tradeoff between PSRAM latency and vectorization speedup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1028", "title": "Ethos-U55 Depth-First Scheduling for Memory Reuse", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which scheduling strategy is optimal given the shared SRAM constraints, and why?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1029", "title": "Evaluating Profiling Strategies for nRF5340 Audio Inference Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which profiling approach is better for isolating compute vs. 
I/O bottlenecks given the 256 KB SRAM constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1030", "title": "Structured vs Unstructured Pruning on nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which pruning strategy is better suited to fit the model onto the nRF5340 while minimizing execution time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1031", "title": "On-Device Anti-Spoofing Guardrail Evaluation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an nRF5340 smart lock run a 150 KB anti-spoofing model locally or send 16 KB audio to a phone?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1032", "title": "Audio Streaming DMA vs Interrupts on nRF5340", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an audio detector use 16 ms ADC interrupts or EasyDMA into a 32 KB ping-pong buffer, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1033", "title": "Evaluate SRAM vs Flash Tensor Arena Placement", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory planning strategy yields better system performance, and what are the architectural tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1034", "title": "ESP32-S3 Vector Extension Efficiency", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are ESP32-S3 vector and scalar MAC throughputs for 20 MMAC at 10 FPS, and how much CPU utilization does each require?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1035", "title": "Randomized Smoothing Latency on Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How long does 20-pass randomized smoothing take on Cortex-M4, including overhead, and does it meet the 500 ms deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1036", "title": "Inference Latency and Energy on Nordic nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": 
"L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What latency and energy per inference result from a 2.5 MOP keyword model on nRF5340 at 128 MHz and 5 mA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1037", "title": "Active Learning Storage Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many 1-second uncertain audio clips can be stored on-device before needing to offload data?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 0}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1038", "title": "DMA Cycle Stealing Overhead", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What DMA bandwidth and cycle-stealing overhead does a 96x96 camera stream impose on inference when running at 168 MHz?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1039", "title": "Energy Cost of Memory vs Compute on Ethos-U55", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy do Ethos-U55 dense-layer MACs and SRAM weight fetches consume, and which dominates?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1040", "title": "ESP32-S3 Low-Battery Model Fallback", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the ESP32-S3 inference latencies for the primary and low-battery fallback doorbell models?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1041", "title": "BatchNorm Constant Folding Flash Savings", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the exact amount of Flash memory saved by this optimization, and what fraction of the nRF5340's total 1 MB Flash does this represent?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1042", "title": "Distilled INT8 Model Latency on ESP32-S3", 
"topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Can an ESP32-S3 INT8 student model with 4.5M MACs meet a 15 ms audio-frame deadline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1043", "title": "nRF5340 End-to-End Wake Word Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total end-to-end latency from the moment the audio frame is ready for processing to the completion of the BLE transmission?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1044", "title": "SRAM Capacity Limits on Corstone-300", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the Corstone-300 convolution fit in 512 KB SRAM without tiling, and how much memory is missing?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 0}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1045", "title": "Ethos-U55 CPU Fallback Latency Penalty", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What execution-time penalty occurs when a 4.8M-MAC Ethos-U55 layer falls back to the Cortex-M7, assuming standard CMSIS-NN throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1046", "title": "ESP32-S3 INT8 Keyword Spotting Memory Footprint", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does the INT8 keyword-spotting CNN fit entirely in ESP32-S3 internal SRAM, or must it use the slower PSRAM?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1047", "title": "Calculate Max MACs for NAS Search Space Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What absolute maximum number of INT8 MACs should a Cortex-M4 NAS enforce for a 15 ms budget at 60% utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1048", "title": "Layer Fusion for SRAM Peak Memory Reduction", 
"topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the peak memory footprint for standard sequential scheduling versus fused operator scheduling, and determine whether they fit entirely within the fast SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1049", "title": "CPU vs External NPU Bottlenecks", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is the external I2C NPU slower than local CMSIS-NN on the Cortex-M4 for this convolution layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1050", "title": "Denial-of-Sleep via Early Exit Exploitation", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does a denial-of-sleep attack on early exits change average current and coin-cell battery life?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1051", "title": "ESP32-S3 Depthwise Memory Bottleneck", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this hardware-specific bottleneck occur despite the massive reduction in mathematical operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1052", "title": "Non-linear Latency Scaling on Corstone-300", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a mere 20% increase in compute requirements result in a 7.5x increase in total inference latency on the Corstone-300?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1053", "title": "Calibration Dataset Outlier Bias", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "training", "question": "Why does this specific calibration dataset composition cause the integer-only NPU to fail on subtle inputs?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 1}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1054", "title": "DMA Buffering vs CPU Interrupts on nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": 
"Why does the CPU-driven approach cause overruns, and what is the CPU context-switch time saved per second by switching to DMA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1055", "title": "Coin Cell Capacity Degradation Under Inference Load", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the 5mA pulse load cause the device to prematurely brownout and fail at less than 50% of the coin cell's rated capacity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1056", "title": "Memory vs Compute Energy in ESP32-S3 Workloads", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can a 100k-MAC fully connected layer in PSRAM use more energy than a 500k-MAC convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1057", "title": "Ternary Unpacking Overhead on Cortex-M33", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ternary packing shrink an nRF5340 KWS model from 100 KB to 25 KB but add 3.1 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1058", "title": "Fallback Model Latency Scaling Analysis", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does an 8x smaller fallback CNN reduce MCU inference latency by only about half?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1059", "title": "AOT Compiler Memory-Latency Tradeoff Analysis", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you explain the architectural tradeoffs the compiler made to achieve this memory reduction, and calculate the energy difference per inference?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1060", "title": "Wake-Word Pipeline Latency Breakdown", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which stages dominate the Corstone-300 wake-word pipeline latency, and why is the Ethos-U55 not the bottleneck?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1061", "title": "NPU Fallback Bottleneck on Ethos-U55", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does this massive latency spike occur, and what are the underlying compute and memory constraints of this heterogeneous setup?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1062", "title": "ESP32-S3 Inference Latency Anomaly Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving 350 KB of ESP32-S3 INT8 weights from SRAM to PSRAM raise inference latency from 15 ms to 75 ms?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 1}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1063", "title": "Analyzing SRAM OOM in Early Convolution Layers", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can the first 64x64x32 INT8 convolution OOM an STM32F4 despite weights staying in flash?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1064", "title": "Hardware-Aware NAS SRAM Bottleneck on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the hardware-aware NAS reject high-accuracy candidate architectures despite their small 150 KB parameter size?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1065", "title": "SRAM Optimization via Fused Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the default sequential schedule fail, and how does the fused execution order resolve the SRAM bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1066", "title": "SRAM Contention During A/B OTA Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a Corstone-300 OTA update OOM when a 360 KB NPU model, 100 KB RTOS stack, and flash buffer share 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1067", "title": "Energy Analysis of DVFS on Ethos-U55", "topic": 
"power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does this non-linear energy savings occur, and what is the dynamic energy per inference for both P-states?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1068", "title": "PSRAM Bandwidth Bottleneck on ESP32-S3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does an ESP32-S3 spend 25 ms on a 1 MB INT8 model stored in 80 MHz Quad SPI PSRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1069", "title": "Unstructured Pruning Latency Regression", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does unstructured pruning cause a latency regression on the Ethos-U55 architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1070", "title": "Per-Channel Requantization Overhead Analysis", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many extra cycles per channel explain the 4.57 us per-channel quantization overhead on STM32F4, given a 168 MHz clock?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1071", "title": "Interrupt-Driven Missed Audio Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 600,000-cycle Bluetooth interrupt make a 168 MHz Cortex-M4 audio pipeline miss a 16 ms frame deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 2}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1072", "title": "Fairness Guardrail PSRAM Latency Bottleneck on ESP32", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the ESP32-S3 fairness guardrail violate the 50 ms SLA after moving weights to PSRAM?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1073", "title": "ESP32-S3 SRAM vs PSRAM Roofline Shift", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": 
"analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How does moving ESP32-S3 CNN weights from SRAM to PSRAM shift the roofline and reduce utilization?", "chain_ids": ["tinyml-chain-auto-secondary-013-24"], "chain_positions": {"tinyml-chain-auto-secondary-013-24": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1074", "title": "Shared SRAM Contention WDT Resets", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why does the system pass lab tests but experience WDT resets in the field during heavy BLE activity?", "chain_ids": ["tinyml-chain-auto-secondary-008-08"], "chain_positions": {"tinyml-chain-auto-secondary-008-08": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-008-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1075", "title": "PSRAM Latency in Real-Time Audio Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does placing the rolling audio DMA buffer in ESP32-S3 PSRAM cause dropped real-time frames?", "chain_ids": ["tinyml-chain-auto-secondary-013-28"], "chain_positions": {"tinyml-chain-auto-secondary-013-28": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1076", "title": "Tensor Arena Peak Overlap Analysis", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the static memory planner fail with an OOM error at the Conv2D layer, despite the largest single tensor fitting in SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1077", "title": "Evaluating ESP32-S3 Vector Extensions for Audio ML", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the tradeoffs between FP32 on the standard CPU versus INT8 on vector extensions for the ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1078", "title": "Side-Channel Defense Architecture for STM32F4 Audio Authentication", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the STM32F4 speaker verification model defend against timing and power side-channel extraction within memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1079", "title": "nRF5340 Dual-Core Compute Partitioning", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the compute pipeline, DSP, model inference, and BLE tasks be partitioned across the dual cores to minimize energy consumption while respecting the hardware limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1080", "title": "Active Learning Pipeline for nRF5340 Wearable", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should an nRF5340 wearable selectively buffer and transmit active-learning samples without draining the battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1081", "title": "Ping-Pong DMA Architecture for Continuous Audio Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you architect the data movement strategy using DMA to ensure zero audio samples are dropped while the CPU computes the inference, and what are the required buffer sizes and memory layout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1082", "title": "Energy-Efficient Audio Pipeline Design on Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which partition should win under energy modeling, and when would you reject an NPU-heavy design that spills beyond 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1083", "title": "Architecting Sub-4-bit Keyword Spotting", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate and execute a sub-4-bit extreme quantization strategy on a processor that only supports INT8 SIMD instructions?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-08": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1084", "title": "Architecting Graceful Degradation for ESP32-S3 Voice Commands", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a smart speaker stage SRAM and PSRAM models for WiFi loss and low-power fallback?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 1}, "chain_tiers": 
{"tinyml-chain-auto-secondary-012-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1085", "title": "AOT Compiler Memory Architecture for STM32F4", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should an AOT compiler fuse operators and plan memory so a 300 KB activation model fits in STM32F4 SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1086", "title": "Architecting Knowledge Distillation for KWS on ESP32-S3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should an ESP32-S3 KWS student be distilled so weights and activations stay within the 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1087", "title": "Dual-Core Latency Pipeline for BLE Wake-Word", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should nRF5340 cores pipeline audio, MFCC, inference, and BLE to meet a 150 ms wake-word budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1088", "title": "Corstone-300 Memory Allocation for KWS", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 allocate the 512 KB of shared SRAM and external Flash to maximize Ethos-U55 utilization without hitting OOM errors?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 2}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1089", "title": "Corstone-300 NPU Delegation and Memory Architecture", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the conversion and delegation strategy to handle this CPU-NPU context switch while ensuring the system operates within the strict 512 KB SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1090", "title": "ESP32-S3 Audio Wake Word System Architecture", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", 
"question": "How should ESP32-S3 manage SRAM, PSRAM, cores, and audio buffers for an always-on wake-word system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1091", "title": "Hardware-Aware NAS Design for Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should a Cortex-M4 NAS pipeline enforce a 50 ms latency budget and 200 KB activation limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1092", "title": "Dual-Core Memory-Aware Operator Scheduling on ESP32-S3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the microcontroller schedule Conv2D and depthwise operators to avoid materializing a 400 KB intermediate tensor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1093", "title": "Sparse Wake-Word Design for Nordic nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you design a pruning and sparsity strategy that leverages the hardware's architecture to meet memory and power constraints without degrading accuracy?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1094", "title": "ESP32-S3 Audio Wake-Word Quantization Pipeline", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate and design the optimal quantization strategy to fit memory constraints while preserving accuracy?", "chain_ids": ["tinyml-chain-auto-secondary-004-16"], "chain_positions": {"tinyml-chain-auto-secondary-004-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1095", "title": "ESP32-S3 Real-Time Audio Anomaly Detection Pipeline", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should ESP32-S3 isolate real-time audio inference from WiFi jitter to guarantee a 10 ms deadline?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1096", "title": "On-Device Guardrails for Predictive Maintenance", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should the Corstone-300 safety guardrail be implemented as a secondary model on the Ethos-U55 NPU or as deterministic physical-bounds checks 
on the Cortex-M7, and why?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1097", "title": "Architecting a Vision Pipeline on Corstone-300", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should Corstone-300 partition M7 post-processing and U55 convolutions to improve roofline utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1098", "title": "Continuous Audio Ingestion Pipeline on Corstone-300", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 pipeline be architected to ingest 16 kHz audio without dropping frames while sharing 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1099", "title": "CNN Backbone Architecture for Corstone-300", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should CNN convolution blocks be architected to maximize Ethos-U55 utilization without causing OOM errors in the 512 KB shared SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1100", "title": "Solar-Powered Acoustic Event Duty Cycle Design", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What sleep and wake schedule lets a solar-powered STM32F4 acoustic detector monitor continuously within its energy budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1101", "title": "ESP32-S3 Dual-Model Tensor Arena Architecture", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you design the memory planning, operator scheduling, and arena placement between SRAM and PSRAM to meet real-time latency requirements without exceeding the memory budget?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 2}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1102", "title": "Shared SRAM Contention Side-Channel", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How 
can an unprivileged M7 telemetry task extract model structure from Ethos-U55 SRAM contention?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1103", "title": "ESP32-S3 PSRAM Bandwidth Compute Bottleneck", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What hardware constraint is causing your compute estimation to be inaccurate, and how do you diagnose the root cause?", "chain_ids": ["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1104", "title": "ESP32-S3 Camera Sensor Domain Gap Diagnosis", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What dataset curation failure occurred, and how does the hardware architecture explain this symptom?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1105", "title": "NPU Garbage Predictions After CPU Preprocessing", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does Ethos-U55 read random features after the M7 computes MFCCs in a 16 KB DMA buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1106", "title": "Diagnosing Power Drain from Memory Access", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a fully connected Cortex-M4 model use more energy than a convolutional model with the same 5M MACs?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1107", "title": "Ethos-U55 W4A8 Fallback Stalls", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a W4A8 CNN on Corstone-300 run on the 480 MHz M7 instead of the Ethos-U55, and what is the hardware-level root cause?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1108", "title": "Diagnosing WDT Resets During Peak BLE Transmissions", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": 
"L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do nRF5340 WDT resets correlate with BLE retries, and what degradation ladder should prevent them?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1109", "title": "Diagnosing BLE Drops from Dense Distilled Models", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does a dense distilled model on nRF5340 drop BLE 5.3 connections and trigger watchdog resets during inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1110", "title": "Diagnosing Fully Connected Layer Bottlenecks on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a CMSIS-NN fully connected layer on the STM32F4 run 4x slower than its theoretical INT8 SIMD performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1111", "title": "Diagnosing Operator Fallback in TFLite Micro Conversion", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this latency anomaly during the conversion and deployment pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1112", "title": "Diagnosing SRAM Exhaustion on nRF5340 with BLE", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which nRF5340 SRAM users were missing from the team's 180 KB tensor-arena estimate?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1113", "title": "Diagnosing NAS SRAM Constraints on Corstone-300", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the Corstone-300 NAS winner HardFault despite a 380 KB activation estimate, and what memory overhead was missed?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1114", "title": "SRAM Exhaustion in Multi-Branch CNN Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the branched nRF5340 CNN HardFault even though 
the largest individual activation is only 140 KB?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1115", "title": "ESP32-S3 Brownout During Concurrent Vision Inference and WiFi TX", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ESP32-S3 vision inference plus WiFi TX trigger brownout on an 80% charged LiPo battery?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1116", "title": "Diagnosing CMSIS-NN SIMD Underutilization", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 500K-MAC 1x1 convolution take 15 ms on a 168 MHz STM32F4 despite CMSIS-NN SIMD?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1117", "title": "INT8 Per-Tensor Degradation in Depthwise Convolutions", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What quantization fundamental causes massive activation deviations in depthwise convolution layers under per-tensor INT8 PTQ?", "chain_ids": ["tinyml-chain-auto-secondary-004-17"], "chain_positions": {"tinyml-chain-auto-secondary-004-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1118", "title": "Diagnosing Bias Amplification in Compressed Audio Models", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this demographic performance disparity and explain how hardware constraints contributed to the symptom?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1119", "title": "Diagnosing Sensor Buffer Overrun During Inference", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 4 kHz accelerometer DMA ping-pong buffer corrupt windows during a 35 ms Cortex-M4 inference?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1120", "title": "Nordic nRF5340 SRAM Crash During Convolution", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the model HardFault during the first optimized convolution despite a 140 KB tensor arena estimate?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1121", "title": "NPU to CPU Porting Power Diagnosis", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does the nRF5340 port drain a coin cell faster than the old NPU design despite lower peak current?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1122", "title": "Partitioning Workloads on Cortex-M7 and Ethos-U55", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which Corstone-300 architecture is optimal given the shared SRAM constraints, and what are the system-level latency tradeoffs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1123", "title": "Mitigating DPA Attacks on ESP32-S3 Wake-Word Models", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which ESP32-S3 DPA defense keeps a 15M-MAC wake-word model within real-time latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1124", "title": "Real-time Compute Estimation for Sensor Data", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which INT8 model meets the 20ms STM32F4 sensor stream budget when operating at 50% MAC utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1125", "title": "On-Device Active Learning Curation for STM32F4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which on-device data selection strategy is best given the STM32F4's hardware constraints?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 2}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1126", "title": "DMA Buffer Placement vs Activation Spilling", "topic": "dma-data-movement", "competency_area": "memory", 
"track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate both designs and identify the optimal choice for overall system throughput?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1127", "title": "Energy Tradeoffs of LUT vs On-the-Fly Computation", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which approach is more energy-efficient per operation and how it impacts the overall system power budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1128", "title": "Evaluating 4-bit vs INT8 on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate INT8 versus 4-bit weight-only quantization for latency and power, and which approach is superior for this specific architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1129", "title": "Degradation Strategy for Ethos-U55 Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What graceful degradation strategy should keep Corstone-300 anomaly detection fail-operational during NPU thermal shutdown?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1130", "title": "Evaluating Distillation vs Pruning for Ethos-U55 Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Should a Corstone-300 wake-word model use 40% unstructured pruning or a dense 350 KB distilled student to fit 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1131", "title": "Evaluating End-to-End Latency Tradeoffs on Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which pipeline provides better end-to-end latency, and how do the preprocessing and inference components contribute to the total time?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1132", "title": "XIP vs 
DMA Paging on nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What are the latency and power trade-offs between XIP and DMA double-buffering for weights on the nRF5340?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 1}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1133", "title": "Evaluating SRAM Constraints on Corstone-300", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which model (A or B) should be selected, and how does the shared memory architecture influence this decision on the Corstone-300?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1134", "title": "Evaluating Hardware-Aware NAS for ESP32-S3", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which model is the better architectural choice for a battery-powered device and what are the system-level tradeoffs?", "chain_ids": ["tinyml-chain-auto-secondary-011-21"], "chain_positions": {"tinyml-chain-auto-secondary-011-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1135", "title": "Corstone-300 Operator Cascading and CPU-NPU Pipelining", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should Corstone-300 use Ethos-U55 layer cascading or sequential SRAM writes for a Conv2D-Depthwise-Dense chain?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1136", "title": "Evaluate Race-to-Sleep vs DVFS for Cortex-M4 KWS", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which power strategy uses less energy for Cortex-M4 keyword spotting, race-to-sleep or DVFS, and by how much?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1137", "title": "Evaluating Profiling Strategies for BLE Audio on nRF5340", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which profiling method is better without disrupting the BLE timing on the 64 MHz network core?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-1138", "title": "Evaluating Pruning Strategies on nRF5340", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "Which option do you deploy to minimize active power draw (~5mA) and satisfy memory constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1139", "title": "Evaluating On-Device Privacy Guardrails for Audio ML", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which privacy guardrail approach fits the SRAM and power limits while enabling demographic false-positive audits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1140", "title": "nRF5340 Roofline Memory Hierarchy Tradeoffs", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate both models using a roofline analysis to determine which is better for minimizing active time and maintaining the ~5mA power constraint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1141", "title": "Dual-Core BLE Streaming Architecture Evaluation", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate which architecture is better for maximizing battery life (~5mA active current constraint) while ensuring no dropped packets during high-throughput BLE streaming?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1142", "title": "Ethos-U55 vs CPU Tensor Placement Tradeoffs", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which memory planning approach do you choose and how does it impact your SRAM peak usage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1143", "title": "PTQ vs QAT fallback on Ethos-U55", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should Corstone-300 keep sensitive layers FP32 on the M7 or use full INT8 QAT for the Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1144", "title": "Evaluating CNN Data Layouts for Cortex-M4 Real-Time Deadlines", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", 
"question": "Which model do you choose to guarantee the 5ms real-time deadline, and how does the architectural tradeoff affect worst-case execution time (WCET)?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1145", "title": "Estimating Energy Per Inference on Nordic nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does one nRF5340 inference consume when 12.8 million cycles run at 5 mA and 3 V?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1146", "title": "Corstone-300 NPU Latency Profiling Math", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical Ethos-U55 latency for the 1.2M-MAC wake-word model once CPU fallback is eliminated?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1147", "title": "Guardrail Latency Budget on Ethos-U55", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the maximum theoretical MAC complexity the guardrail model can support to mathematically guarantee it meets the latency deadline?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1148", "title": "Roofline Ridge Point Calculation on Ethos-U55", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the Ethos-U55 peak GOPS, SRAM roofline ridge point, and bottleneck for a 16 OP/byte layer?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1149", "title": "Audio Ingestion Memory and Cycle Budgeting", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What circular audio buffer size and M7 cycle budget are needed for zero-copy 250 ms stride processing on Corstone-300?", "chain_ids": 
["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-27": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1150", "title": "ESP32-S3 Tensor Arena Sizing with WiFi", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the required size of the flat tensor arena and determine if it can be allocated entirely in the internal SRAM?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 0}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1151", "title": "ESP32-S3 Vector Extension Speedup", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the theoretical minimum inference latency using standard scalar instructions (1 MAC per cycle) versus the specialized INT8 vector extensions (16 MACs per cycle)?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], "chain_positions": {"tinyml-chain-auto-secondary-012-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1152", "title": "Adversarial Detection Latency on Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What theoretical latency does a 336k-MAC INT8 adversarial detector add on Cortex-M4 with CMSIS-NN SIMD?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1153", "title": "Estimating Inference Latency and Energy on nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the expected latency for a single inference and the active energy consumed if the system draws ~5mA at 3.0V?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1155", "title": "2-Bit Weight Unpacking and Execution on Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total Flash footprint for these weights and the theoretical minimum compute cycles for the MAC operations?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 0}, "chain_tiers": 
{"tinyml-chain-auto-secondary-009-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1156", "title": "Ethos-U55 Compiler Fallback Latency", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How many cycles does a 10% Cortex-M7 fallback add during Ethos-U55 inference, and what share of time is CPU-bound?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 0}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1157", "title": "ESP32-S3 Peak Memory Footprint Estimation", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the peak SRAM footprint required for this model's weights and the bottleneck layer's activations, and does it fit in internal SRAM alongside 120 KB of RTOS overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1158", "title": "Watchdog Timeout Calculation for Corstone-300", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the total execution time in milliseconds to establish the minimum safe watchdog timeout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1159", "title": "Theoretical Latency Comparison of CPU vs NPU", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What are the theoretical minimum CPU and Ethos-U55 execution times for a 2.4M-MAC Corstone-300 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1160", "title": "Randomized Smoothing Latency on ESP32-S3", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much latency does a five-pass randomized smoothing defense add on ESP32-S3, and does it meet a 150 ms deadline?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1161", "title": "MAC Reduction from Depthwise Separable Convolution", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the 
relative MAC reduction from replacing this fixed 3x3 convolution with a depthwise separable convolution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1162", "title": "Estimating Inference Latency for INT8 Convolution on Cortex-M4", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical minimum latency of the 32-to-64 channel INT8 convolution on Cortex-M4 running at 168 MHz?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1163", "title": "Active Learning Buffer Capacity on STM32F4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How many complete 1.5-second 8-bit audio samples can the STM32F4 buffer in remaining flash for active learning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1164", "title": "ESP32-S3 PSRAM to SRAM DMA Streaming Latency", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the minimum latency incurred by DMA-streaming 4 MB of ESP32-S3 weights from PSRAM to SRAM at 40 MB/s?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1165", "title": "Calculating Footprint for 3-bit Quantized Weights", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the exact memory in kilobytes (1 KB = 1024 bytes) required to store these 3-bit quantized weights, assuming they are tightly packed without any padding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1166", "title": "Calculate Fallback Model MAC Budget on Cortex-M7", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "If the system requires a strict latency budget of 15ms and the Cortex-M7 achieves 0.5 MACs per cycle, what is the maximum MAC limit for the fallback model?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1167", "title": "Distillation vs Pruning Latency on Ethos-U55", "topic": "knowledge-distillation", "competency_area": "optimization", "track":
"tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "How would you calculate the inference latency in milliseconds for unstructured pruning vs distillation to demonstrate why distillation beats pruning?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1168", "title": "Calculate End-to-End Wake Word Pipeline Latency", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the theoretical end-to-end Cortex-M4 wake-word latency including MFCC, inference, and post-processing assuming 100% MAC utilization?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1169", "title": "SRAM Allocation and Peak Memory Calculation on nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much SRAM remains for the ML arena, and will this deployment fit?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 0}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1170", "title": "TFLite Micro Memory Allocation on nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the remaining available Flash and SRAM on the device after deploying and running the converted model?", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 0}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1171", "title": "SRAM Constraint Calculation for Ethos-U55 CNN Deployment", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum number of model parameters you can support if the entire model must reside in SRAM to meet latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1172", "title": "Layer Fusion SRAM Calculation", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": 
"What are the sequential and fused peak SRAM requirements for the two-layer convolution subgraph?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1173", "title": "Calculate Inference Latency from Trace Cycles", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the total latency for 1.5M MAC cycles plus 600K SRAM overhead cycles on a 128 MHz nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1174", "title": "Calculate Latency of Pruning Methods on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "training", "question": "What latency and flash footprint result from 75% unstructured pruning versus 50% structured pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1175", "title": "Privacy Guardrail Energy Budget Calculation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Calculate the daily energy budget (in millijoules) explicitly spent on enforcing this responsible AI requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1176", "title": "Calculate Roofline Throughput on nRF5340", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What roofline throughput can the nRF5340 achieve for a 1D convolution with 0.125 MACs per byte?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1177", "title": "Vibration Ingestion Buffer Sizing", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate the maximum sleep duration in milliseconds for the app core before it must wake up to process a filled buffer?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1178", "title": "Watchdog Timer Calculation for ML Inference", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the minimum WDT timeout interval (in milliseconds) required to ensure the WDT does not falsely trigger during a single inference pass, adding a 20% safety margin?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1179", "title": "Calculate Energy Per Inference on Cortex-M4", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much energy does one wake-word inference consume at 1.5 INT8 MACs per cycle and 40 mA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1180", "title": "Calculate Vision Pipeline Slack Time", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How much frame slack remains for the 30 FPS Corstone-300 object detector after M7 preprocessing and NPU inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1181", "title": "Heterogeneous Vision Architecture on ESP32-S3", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Why should the security camera use an external SPI NPU, and how do communication costs affect the design?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1182", "title": "Adversarial Defense for Wake-Word on ARM Cortex-M4", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the STM32F4 wake-word system defend against adversarial audio within a strict 50 ms latency budget?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1183", "title": "Continuous Acoustic Wake-Word Partitioning", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you partition the workload between the 128 MHz app core and 64 MHz network core to meet the 100 µA limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1184", "title": "On-Device Active Learning for Audio", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should the nRF5340 architect data buffering, uncertainty compute, and radio scheduling to strictly respect the 256 KB SRAM, 1 MB flash, and ~5mA active power constraints?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 3}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1185", "title": "Continuous DMA Ingestion and Bus Contention", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design the memory architecture and data movement strategy to guarantee zero-copy ingestion, zero data loss, and minimal memory bus contention between the DMA controller and the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1186", "title": "Solar-Powered Audio on ARM Cortex-M4", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a solar-powered Cortex-M4 audio detector stay under a continuous 1 mW budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1187", "title": "Energy-Optimal Architecture for Corstone-300", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a novel model architecture and execution schedule that trades off computational complexity for memory locality to minimize total energy per inference?", "chain_ids": ["tinyml-chain-auto-secondary-012-18"], "chain_positions": {"tinyml-chain-auto-secondary-012-18": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-18": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1188", "title": "Ternary Weight Transformer on Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a ternary-weight KWS transformer on Cortex-M4 store and compute 2-bit weights within Flash and SRAM limits?", "chain_ids": ["tinyml-chain-auto-secondary-009-08"], "chain_positions": {"tinyml-chain-auto-secondary-009-08": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1189", "title": "Personalized Federated Anomaly Detection on nRF5340", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you architect federated learning for non-IID nRF5340 sensors within strict memory, BLE, and power budgets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1190", "title": "Asymmetric Dual-Model Degradation on ESP32-S3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 degrade 
from a PSRAM-streamed CNN to an SRAM-resident anomaly model under low battery or PSRAM throttling?", "chain_ids": ["tinyml-chain-auto-secondary-012-19"], "chain_positions": {"tinyml-chain-auto-secondary-012-19": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1191", "title": "ESP32-S3 SRAM-Bounded Quantization-Aware Distillation", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How do you construct the student architecture and distillation loss function to fit within 512 KB SRAM while maximizing utilization of the Xtensa cores?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1192", "title": "Dual-Core Anomaly Latency Decomposition", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should a dual-core MCU split sensing, FFT, CNN inference, IPC, and BLE transmission to meet a 40 ms anomaly alert budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1193", "title": "Ethos-U55 Memory Hierarchy Design", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 stream Flash-resident CNN weights and schedule SRAM to keep the Ethos-U55 utilized?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 3}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1194", "title": "Ethos-U55 Delegation and Shared SRAM Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a model conversion and runtime delegation strategy to maximize inference throughput while preventing SRAM exhaustion during CPU-NPU context switches?", "chain_ids": ["tinyml-chain-auto-027-02"], "chain_positions": {"tinyml-chain-auto-027-02": 2}, "chain_tiers": {"tinyml-chain-auto-027-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1195", "title": "ESP32-S3 Memory Hierarchy Design for Always-On Audio", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 keep the always-on audio inference critical path in SRAM while prefetching PSRAM weights?", "chain_ids": ["tinyml-chain-auto-secondary-003-16"], "chain_positions": {"tinyml-chain-auto-secondary-003-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-16": "secondary"}, "validated": true, "math_verified": true, 
"scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1196", "title": "Hardware-Aware NAS Design for ARM Cortex-M4 Keyword Spotting", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should a Cortex-M4 hardware-aware NAS encode SRAM, Flash, latency, and CMSIS-NN SIMD constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1197", "title": "Dual-Core Operator Scheduling for Heterogeneous Memory", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 schedule the CNN-LSTM cascade across cores and memory tiers to fit 512 KB SRAM and 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1198", "title": "Asymmetric Dual-Core ML Power Partitioning", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system partition DMA, DSP, ML inference, BLE, and sleep states to meet a one-year fall-detection battery budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1199", "title": "NPU-CPU Bus Contention and Trace Profiling in Corstone-300", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a profiling strategy to isolate the root cause and propose an architectural modification to the model or memory layout to achieve the target latency?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1200", "title": "NPU-Aware Sparsity on Corstone-300", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "training", "question": "How should a 750 KB acoustic model be pruned to fit Corstone-300's 512 KB SRAM and preserve Ethos-U55 utilization?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1201", "title": "Mixed-Precision Audio Keyword Spotting on ESP32-S3", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you choose calibration data, assign layer precisions, and map mixed-width kernels to ESP32-S3 SIMD?", "chain_ids": ["tinyml-chain-auto-secondary-004-16"], "chain_positions": {"tinyml-chain-auto-secondary-004-16": 1}, "chain_tiers": 
{"tinyml-chain-auto-secondary-004-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1202", "title": "Dual-Core Real-Time Inference Architecture", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should nRF5340 guarantee a 1 ms vibration inference deadline while streaming BLE alerts without causing connection drops?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 3}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1203", "title": "On-Device PII Redaction Guardrail", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 detect and redact spoken PII on-device while fitting DSP, CNN, and buffers in 512 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1204", "title": "Corstone-300 Roofline Optimization for Micro-Transformers", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should Corstone-300 fuse micro-transformer attention to move from memory-bound to compute-bound execution within 512 KB?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1205", "title": "ESP32-S3 ASIL-B Fallback Architecture", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should ESP32-S3 partition safety and ML work to satisfy an ASIL-B 50 ms fault-tolerant interval?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1206", "title": "Real-Time Vibration Ingestion on Corstone-300", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should the system use DMA and zero-copy feature computation to avoid dropped 16 kHz vibration frames?", "chain_ids": ["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1207", "title": "NPU-Aware CNN Design for Ethos-U55", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an NPU visual wake-word CNN be redesigned to maximize MAC utilization without SRAM spilling or MCU fallback?", "chain_ids": ["tinyml-chain-auto-016-08"], "chain_positions": {"tinyml-chain-auto-016-08": 2}, "chain_tiers": {"tinyml-chain-auto-016-08": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1208", "title": "Ethos-U55 Compiler Tiling for Operator Fallback", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How can a compiler synthesize tiling and pipelining between the U55 and M7 to avoid spilling a 300 KB activation to external memory?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1209", "title": "Fail-Safe OTA Flash Partitioning for MCU Firmware and Model Updates", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an edge MCU structure a fail-safe OTA update for firmware and model payloads within exactly 1 MB Flash?", "chain_ids": ["tinyml-chain-auto-secondary-004-10"], "chain_positions": {"tinyml-chain-auto-secondary-004-10": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1210", "title": "Cross-Hierarchy Tensor Arena for ESP32-S3", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a custom static memory planner and operator schedule that partitions the tensor arena across SRAM and PSRAM to ensure real-time execution?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 3}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1211", "title": "SIMD vs NPU for STM32F4 Inference", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should the microcontroller add an external NPU or use CMSIS-NN SIMD to meet the 20 ms CNN deadline, and why?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": {"tinyml-chain-auto-secondary-012-16": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1212", "title": "Optimizing Side-Channel Defenses on nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can you optimize this side-channel defense to maintain security while reducing compute overhead to meet power constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1213", "title": "Ethos-U55 Compute Utilization and Bottleneck Analysis", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you quantify the compute utilization bottleneck and the expected latency if the feature maps are optimized to fit entirely within SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-004-02"], "chain_positions": {"tinyml-chain-auto-secondary-004-02": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1214", "title": "Optimizing Window Annotations for NPU SRAM", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you adjust the annotation workflow to resolve this hardware bottleneck, and what is the quantified impact?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1215", "title": "Zero-Copy Audio DMA on nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How much does zero-copy EasyDMA save in terms of SRAM and CPU latency when the nRF5340 currently copies a 16 KB audio buffer before each inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1216", "title": "ESP32-S3 Keyword Spotting Energy Bottleneck Optimization", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 reduce keyword-spotting energy when PSRAM weight reads dominate each inference?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1217", "title": "Audio Model Degradation Ladder on Cortex-M4", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the Cortex-M4 audio pipeline shed work when RTOS network activity leaves too few cycles for the primary model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1218", "title": 
"Distillation vs Pruning for INT8 SIMD", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the 80% sparse Cortex-M4 model miss the 20 ms deadline, and how does a dense distilled INT8 model fix it?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1219", "title": "Corstone-300 Vision Pipeline Bottleneck", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What causes the 60 ms non-NPU latency in the Corstone-300 vision pipeline, and how should preprocessing and weight fetching be optimized?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1220", "title": "ESP32-S3 Operator Fallback Optimization", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this conversion bottleneck and quantify the performance gained by modifying the model format to use a supported operator?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 2}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1221", "title": "Optimizing Peak SRAM Footprint for CNN on STM32F4", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does a 400 KB-weight STM32F4 CNN OOM at runtime, and how should the bottleneck activation layer be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1222", "title": "Hardware-Aware NAS Constraints on Dual-Core MCU", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you analyze the system-level interactions causing OOM faults and quantify the new constraints for your NAS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1223", "title": "Depthwise and Pointwise Convolution Scheduling for SRAM Optimization", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you diagnose this bottleneck and quantify the memory savings of an optimized scheduling approach on a memory-constrained MCU?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1224", "title": "Delta OTA Optimization for Ethos-U55 NPU Models", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this storage bottleneck and quantify an optimization strategy to perform reliable, in-place OTA updates without external memory?", "chain_ids": ["tinyml-chain-auto-secondary-004-12"], "chain_positions": {"tinyml-chain-auto-secondary-004-12": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1225", "title": "SRAM Bottlenecks and Structured Pruning on ESP32-S3", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does 50% unstructured pruning of an 800 KB INT8 model fail to fit in 512 KB SRAM, and how does it affect ESP32-S3 performance?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1226", "title": "Symmetric Weights and SIMD Utilization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do asymmetric INT8 weights slow Cortex-M4 pointwise convolutions, and how should quantization be changed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1227", "title": "Optimizing OOD Safety Guardrails on ESP32-S3", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 replace a 10-component FP32 GMM OOD guardrail that exceeds the 50 ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1228", "title": "ESP32-S3 Roofline Analysis for Wake-Word", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Using roofline analysis, diagnose the compute vs. 
memory bottleneck and estimate the speedup from pinning weights in internal SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-013-24"], "chain_positions": {"tinyml-chain-auto-secondary-013-24": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-24": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1229", "title": "Optimizing Inference for Watchdog Deadlines", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How can CMSIS-NN SIMD keep inference and memory self-test within the 10 ms watchdog window?", "chain_ids": ["tinyml-chain-auto-secondary-008-06"], "chain_positions": {"tinyml-chain-auto-secondary-008-06": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1230", "title": "DMA Ping-Pong Buffering for Continuous Sensor Ingestion", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the ingestion bottleneck, quantify the data loss, and determine the architectural fix required?", "chain_ids": ["tinyml-chain-auto-secondary-013-28"], "chain_positions": {"tinyml-chain-auto-secondary-013-28": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-28": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1231", "title": "In-Place Tensor Arena Optimization for Residual Blocks", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you analyze this bottleneck and quantify an optimization to the memory planner to resolve it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1232", "title": "Federated Learning Memory Bottleneck on Corstone-300", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should federated learning on Corstone-300 avoid SRAM OOM during local updates and global model reception?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1233", "title": "Optimizing CNN Inference on ESP32-S3 via INT8 Lowering", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How does INT8 lowering fix the ESP32-S3 keyword CNNs PSRAM and latency bottlenecks, and what speedup results?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1234", "title": "ESP32-S3 SRAM vs PSRAM Activation Bottleneck", 
"topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the bottleneck and quantify how reallocating memory resolves the latency issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1235", "title": "Corstone-300 NPU Power Bottleneck and DVFS Optimization", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose this power bottleneck and quantify a power optimization strategy to stay under the 50mW cap without missing the 50ms latency SLA?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1236", "title": "Mitigating Shared SRAM Contention in U55", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do Ethos-U55 latencies spike to 12 ms when M7 sensor interrupts log into the shared 512 KB SRAM, and how is it resolved?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1237", "title": "MobileNet Partitioning for ESP32-S3 Memory Hierarchy", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should MobileNetV2 execution on ESP32-S3 partition weights and activations between PSRAM and SRAM to minimize latency?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1238", "title": "On-Device Hard Negative Mining for ESP32-S3", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How should ESP32-S3 perform on-device hard-negative mining without exhausting SRAM or radio power?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 3}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1239", "title": "Zero-Copy DMA Pipeline for Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the Corstone-300 architecture utilize DMA to handle 60 FPS visual inference without inducing memory bus contention?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1240", "title": "Arithmetic Intensity and SRAM Energy 
Tradeoffs", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Why can depthwise separable convolutions hurt STM32F4 battery life, and how should arithmetic intensity guide layer choices?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1241", "title": "2-Bit Weight Storage for Ethos-U55 Keyword Spotting", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you evaluate 2-bit packed weights for Ethos-U55 keyword spotting without assuming native 2-bit execution?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1242", "title": "Federated Averaging Memory and Communication Sizing on ESP32-S3", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should the ESP32-S3 firmware manage memory buffers and communication schedules to process a 150K-parameter CNN update efficiently?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 1}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1243", "title": "Graceful Degradation for Anomaly Detection on nRF5340", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design this fallback architecture and concretely size the memory and compute allocations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1244", "title": "Sizing a Distilled Keyword Spotting Model for nRF5340", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate the strict platform constraints and size the student model's parameters and peak activations concretely?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1245", "title": "ESP32-S3 Speech-to-Intent Latency Breakdown", "topic": "latency-decomposition", 
"competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Evaluate the end-to-end latency by decomposing it into preprocessing, TTFT, TPOT, and network transmission?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1246", "title": "Evaluating Flash vs SRAM Execution for Cortex-M4 Inference", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should a Cortex-M4 acoustic anomaly model read 700 KB of weights directly from Flash or page them into SRAM with DMA?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 4}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1247", "title": "TFLite Micro Operator Fallback on Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you handle a TFLite Micro layer that falls back from CMSIS-NN to a slow reference kernel on a constrained microcontroller?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 4}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1248", "title": "Sizing an Audio Wakeword Model for Nordic nRF5340", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you size SRAM and flash for an always-on nRF5340 wakeword detector while reserving resources for Zephyr, BLE, and audio buffers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1249", "title": "SRAM-Constrained NAS on Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How do you formulate the NAS memory constraint function to ensure the discovered models can execute without SRAM overflow?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1250", "title": "Dual-Core Operator Scheduling on Nordic nRF5340", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", 
"bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you design the operator execution schedule to leverage the dual cores, apply layer fusion, and optimize memory reuse to fit within the strict SRAM limits?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1251", "title": "ESP32-S3 Flash Partitioning for Model OTA", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is independent firmware and model rollback possible within 8 MB flash, and what deployment tradeoff is required if it is not?", "chain_ids": ["tinyml-chain-auto-secondary-004-13"], "chain_positions": {"tinyml-chain-auto-secondary-004-13": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1252", "title": "Evaluating Race-to-Sleep vs DVFS on ESP32-S3", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an ESP32-S3 acoustic detector use 240 MHz race-to-sleep or 80 MHz underclocking to minimize energy per inference?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1253", "title": "Optimizing SIMD Utilization and Memory Stalls on Cortex-M4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you diagnose the root cause of this poor utilization, and what architectural or memory-level changes do you make to hit your 20ms target?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1254", "title": "Structured vs Unstructured Pruning on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "training", "question": "How would you evaluate this proposal against an alternative 50% structured channel pruning approach, considering the specific architectural constraints of the target hardware?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1255", "title": "Evaluating Quantization Granularity on nRF5340", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Should an nRF5340 wake-word model use per-tensor or per-channel INT8 quantization for depthwise layers under a 50 ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1256", "title": "Sizing an On-Device Anti-Spoofing Guardrail", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "What is the maximum allowable capacity (in memory and MACs) for the new INT8 guardrail model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1257", "title": "Roofline Analysis of Depthwise Convolutions", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Is a 3x3 depthwise convolution on a Cortex-M4 compute-bound or memory-bound under a roofline analysis, and what is its maximum throughput?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1258", "title": "Architecting ML Safety on Dual-Core nRF5340", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you architect watchdogs, MPU protection, and task scheduling so nRF5340 ML inference cannot compromise safety fault handling?", "chain_ids": ["tinyml-chain-auto-secondary-008-08"], "chain_positions": {"tinyml-chain-auto-secondary-008-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-008-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1259", "title": "Audio Streaming Ping-Pong Sizing on Cortex-M4", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should you size the DMA ping-pong audio buffer and remaining compute budget for a Cortex-M4 Mel spectrogram pipeline?", "chain_ids": ["tinyml-chain-auto-secondary-013-29"], "chain_positions": {"tinyml-chain-auto-secondary-013-29": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-29": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1260", "title": "Tensor Arena Optimization for 
BLE-Enabled nRF52840 Edge Devices", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the tradeoffs of different memory planning strategies to resolve this constraint without degrading the ~5mA active power budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1261", "title": "nRF5340 Always-On Wake Word Energy Budgeting", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How would you evaluate the required duty cycle and sleep state power constraints to make this system feasible?", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 3}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1262", "title": "Optimizing Convolution Lowering for Cortex-M4 SIMD", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How should a graph compiler lower depthwise convolutions to exploit Cortex-M4 CMSIS-NN SIMD without memory stalls?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1263", "title": "Keyword Spotting Deadlines on ESP32-S3", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How do you partition the memory and schedule the inference to guarantee the 20ms deadline without dropping audio frames or breaking the BLE connection?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 2}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1264", "title": "Identifying Side-Channel Attacks on Edge NPUs", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What physical side-channel attack can reveal proprietary Ethos-U55 model weights through power or electromagnetic traces during inference?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1265", "title": "ESP32-S3 Memory Hierarchy for Model Deployment", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the specific sizes of the internal SRAM and external PSRAM on the 
ESP32-S3, and why does this matter for compute performance?", "chain_ids": ["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1266", "title": "Data Type Formatting for ESP32-S3 Hardware Acceleration", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What specific numeric data type should the dataset features be mapped to for optimal hardware-accelerated inference on this platform?", "chain_ids": ["tinyml-chain-auto-027-22"], "chain_positions": {"tinyml-chain-auto-027-22": 0}, "chain_tiers": {"tinyml-chain-auto-027-22": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1267", "title": "SRAM vs Compute Energy Cost", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What fundamental fact from the Horowitz energy table should you recall regarding the relative energy cost of SRAM access versus integer compute?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1268", "title": "Defining Fail-Safe vs Fail-Operational on nRF5340", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the difference between fail-safe and fail-operational behavior for an nRF5340 predictive maintenance device?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1269", "title": "Knowledge Distillation Soft Targets on nRF5340", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "What is the specific term for the temperature-scaled probability distribution produced by the teacher model that the student model attempts to match?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1270", "title": "ARM Cortex-M4 SRAM and Flash Capacities", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the exact capacities of the on-chip SRAM and Flash memory for this specific platform?", "chain_ids": ["tinyml-chain-auto-026-01"], 
"chain_positions": {"tinyml-chain-auto-026-01": 0}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1271", "title": "Nordic nRF5340 Memory Limits Recall", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Based on the hardware specifications of the Nordic nRF5340, what are its exact SRAM and flash memory capacities, and will this model fit into SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1272", "title": "Identifying Peak NPU Throughput for Corstone-300 Architecture Search", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "To correctly set the upper bound for the search space's latency estimator, what is the configurable range of MAC operations per cycle supported by the Ethos-U55 NPU?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1273", "title": "Nordic nRF5340 Dual-Core Architecture for Operator Scheduling", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the specific clock frequencies of the application core and the network core on the Nordic nRF5340?", "chain_ids": ["tinyml-chain-auto-secondary-010-20"], "chain_positions": {"tinyml-chain-auto-secondary-010-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1274", "title": "CMOS Dynamic Power Equation Recall", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What CMOS dynamic power equation should you recall when optimizing ESP32-S3 DVFS P-states for an always-on keyword spotter?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1275", "title": "Cycle Count Profiling on ARM Cortex-M4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the name of the standard ARM CoreSight register used to count clock cycles for precise profiling?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": 
{"tinyml-chain-auto-secondary-012-22": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1276", "title": "Standard Transparency Artifacts for Edge Models", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What standard transparency artifact should document intended use, bias evaluation, and hardware limits for an edge audio model?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1277", "title": "Cortex-M4 INT8 MAC Roofline Ceiling", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What is the peak INT8 multiply-accumulate (MAC) throughput per clock cycle on this specific architecture?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1278", "title": "Continuous Audio Ingestion on ARM Cortex-M4", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific microcontroller hardware peripheral and associated memory layout strategy must you recall and implement to stream this data without overwhelming the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1279", "title": "Nordic nRF5340 Dual-Core Clock Frequency Recall", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What are the application-core and network-core clock frequencies of the Nordic nRF5340 for mapping BLE and ML workloads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1280", "title": "Ethos-U55 Supported Quantization Precisions", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which quantization precisions are natively accelerated by the Ethos-U55, and what happens to unsupported sub-4-bit operations?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1281", "title": "Identifying Latency Components on ESP32-S3", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": 
"published", "phase": "inference", "question": "When breaking down the total latency, what term describes the time spent computationally converting raw sensor data into the required input tensor?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1282", "title": "CMSIS-NN Quantization Format Constraints", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific data type and quantization scheme must the model's weights and activations be converted to during the TFLite conversion process to leverage the CMSIS-NN SIMD instructions?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1283", "title": "ESP32-S3 Structured Pruning Impact", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "training", "question": "Which specific type of pruning (structured or unstructured) must you use to ensure the INT8 vector instructions actually execute fewer cycles?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1284", "title": "Asymmetric Quantization Equation for nRF5340", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "When converting weights from FP32 to INT8, what is the standard mathematical equation for asymmetric quantization that maps a real-world floating-point value to a quantized integer value?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1285", "title": "Ethos-U55 Shared SRAM Architecture", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "What specific memory architecture feature of the Corstone-300 allows the NPU to access the 512 KB SRAM without requiring explicit host-to-device DMA transfers?", "chain_ids": ["tinyml-chain-auto-secondary-010-17"], "chain_positions": {"tinyml-chain-auto-secondary-010-17": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1286", "title": "Vibration Anomaly Detection Accelerator Selection", "topic": "accelerator-comparison", "competency_area": 
"compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a 168 MHz Cortex-M4 meet a 20 Hz, 15M-MAC vibration anomaly workload, or does the design need an external NPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1287", "title": "Secure Wake-Word Pipeline Design for nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you specify a secure nRF5340 wake-word pipeline that defends against replay attacks within SRAM, flash, and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1288", "title": "Estimating Inference Latency on Corstone-300", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate Corstone-300 inference latency for a 5M-MAC CNN while accounting for shared-SRAM bandwidth bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1289", "title": "Dataset Specification for Constrained INT8 Acoustic Models", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you specify the curation workflow to ensure the resulting dataset yields a robust INT8-quantized model optimized for the Ethos-U55?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 2}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1290", "title": "nRF5340 DMA Pipeline Design for Audio ML", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 audio ML pipeline use EasyDMA and SRAM buffers to ingest 16 kHz audio without unnecessary CPU wakeups?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1291", "title": "ESP32-S3 Memory Hierarchy Energy Analysis", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP32-S3 model placement strategy is more energy efficient: a 400 KB SRAM-resident model or a 2.5 MB PSRAM model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1292", "title": "Sub-4-bit Quantization Specification for nRF5340 Audio Model", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you specify sub-4-bit weights and INT8 activations so a 1.2M-parameter nRF5340 audio model fits flash and SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1293", "title": "Design Graceful Degradation for Cortex-M4 Keyword Spotter", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M4 keyword spotter degrade gracefully when BLE interrupts reduce the available CPU window to 15 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1294", "title": "Ethos-U55 Operator Offloading and Fallback Analysis", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you analyze the system-level implications of this compilation outcome and specify a graph optimization strategy to satisfy strict latency constraints?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1295", "title": "Designing a Distilled Keyword Spotter for Cortex-M4", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you design an INT8 distilled keyword-spotting student model for a Cortex-M4 with no FPU and tight SRAM limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1296", "title": "Keyword Spotting Latency Budget on Corstone-300", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose a 50 ms Corstone-300 keyword-spotting latency budget across audio capture, M7 preprocessing, and U55 inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1297", "title": "Optimizing Keyword Spotting Memory Allocation on ESP32-S3", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Where should ESP32-S3 KWS and speaker-verification weights, activations, and audio buffers reside to minimize latency and power?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", 
"napkin_math": ""}}, {"id": "tinyml-1298", "title": "KWS Memory Architecture on Cortex-M4", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you partition Cortex-M4 SRAM and flash to determine the maximum feasible size of an INT8 depthwise-separable KWS model?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1299", "title": "Hardware-Aware NAS for Keyword Spotting on nRF5340", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should a hardware-aware NAS pipeline constrain memory, latency, and search rewards for nRF5340 keyword spotting?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1300", "title": "SRAM-Constrained Layer Fusion on Cortex-M4", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Specify a memory management architecture to ensure peak memory stays under 256 KB without modifying model weights?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1301", "title": "Continuous KWS Power Specification on Corstone-300", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Corstone-300 continuous KWS system choose DVFS, duty cycling, and SRAM placement to stay below a 1.5 mW cap while meeting a 100 ms inference latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1302", "title": "ESP32-S3 Wake-Word Profiling Specification", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you profile wake-word latency spikes to distinguish vectorization failures, PSRAM cache thrashing, and I/O preemption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1303", "title": "Designing a Pruning Strategy for ESP32-S3 SRAM Constraints", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a pruning specification that achieves this memory target while ensuring 
the model fully utilizes the ESP32-S3's INT8 acceleration?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1304", "title": "Designing Quantization for Keyword Spotting on Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What INT8 quantization specification lets a 500 KB FP32 keyword spotter avoid FPU emulation and fit memory limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1305", "title": "Acoustic Pipeline Scheduling on Dual-Core MCU", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the 128 MHz app core and 64 MHz net core, how do you partition the pipeline to guarantee this deadline while respecting a ~5mA current limit?", "chain_ids": ["tinyml-chain-auto-024-12"], "chain_positions": {"tinyml-chain-auto-024-12": 1}, "chain_tiers": {"tinyml-chain-auto-024-12": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1306", "title": "On-Device Privacy Guardrails for ESP32-S3 Voice Assistant", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a specification for an on-device, fail-safe guardrail system that prevents unauthorized audio transmission while maintaining acceptable latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1307", "title": "ESP32-S3 Roofline Memory Architecture Specification", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system stage the 800 KB of weights so the arithmetic intensity remains compute-bound and satisfies the 50 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1308", "title": "ESP32-S3 Continuous Audio Ingestion and Feature Extraction Design", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 audio pipeline buffer I2S data and compute MFCCs while preserving 300 KB SRAM for inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1309", "title": "Designing a Static Tensor Arena for Wake-Word CNN", "topic": "tensor-arena-planning", "competency_area": "memory", 
"track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the static memory allocation be designed to fit the tensor arena in SRAM and handle model parameters?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 1}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1310", "title": "Acoustic Pest Detection Duty Cycling on Corstone-300", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a duty-cycling strategy for the M7, Ethos-U55, and shared SRAM to achieve a 2-year battery life on a 220 mAh coin cell?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1311", "title": "Diagnosing CNN Performance on ESP32-S3 with Roofline Analysis", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis diagnose whether an ESP32-S3 keyword-spotting CNN with PSRAM-resident weights is memory-bound or compute-bound?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1312", "title": "Roofline Analysis on Nordic nRF5340: Optimizing TinyML Inference", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you use roofline analysis on an nRF5340 to classify a 100K-MAC, 60 KB-access KWS model as compute-bound or memory-bound?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1313", "title": "Roofline Analysis for TinyML on ESP32-S3", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis account for ESP32-S3 SRAM, PSRAM, and accelerator utilization when diagnosing slow TinyML inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1314", "title": "Roofline Analysis for TinyML: Cortex-M7 + Ethos-U55 Performance Evaluation", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should roofline analysis predict whether a Cortex-M7 plus Ethos-U55 CNN workload is compute-bound or memory-bound, and predict NPU utilization?", "chain_ids": ["tinyml-chain-auto-secondary-013-25"], "chain_positions": {"tinyml-chain-auto-secondary-013-25": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-25": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1315", "title": 
"Roofline Analysis for TinyML on ARM Cortex-M4 STM32F4", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a roofline model tailored to an FPU-less STM32F4 diagnose a slow 1D CNN and guide optimization?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1316", "title": "Diagnosing TinyML Performance on Nordic nRF5340: Accelerator Trade-offs", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose nRF5340 keyword-spotting latency, brownouts, and battery drain when choosing CPU optimization or acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1317", "title": "TinyML Anomaly Detection: Cortex-M4 CPU vs. Custom ASIC for 8-bit CNN", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Should a Cortex-M4 industrial anomaly detector run an INT8 CNN on CPU or offload critical layers to a custom ASIC to meet a 5ms budget?", "chain_ids": ["tinyml-chain-auto-secondary-012-16"], "chain_positions": {"tinyml-chain-auto-secondary-012-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1318", "title": "ESP32-S3 TinyML Accelerator Feasibility", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Does a 200K-MAC KWS model on ESP32-S3 need a custom accelerator, or can optimized CPU inference meet 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1319", "title": "TinyML Accelerator Trade-offs for Edge Anomaly Detection", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which platform best meets the 10 Hz anomaly detection requirement while optimizing for long battery life?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1320", "title": "Accelerator Selection for Edge ML on ESP32-S3", "topic": "accelerator-comparison", "competency_area": "compute", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare the ESP32-S3 CPU, a minimal NPU, and a custom ASIC for small image classification under power limits?", "chain_ids": ["tinyml-chain-auto-secondary-012-15"], 
"chain_positions": {"tinyml-chain-auto-secondary-012-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-012-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1321", "title": "Optimizing a TinyML Model for Nordic nRF5340 Constraints", "topic": "mcu-compute-constraints", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you optimize an nRF5340 anomaly model to meet 100 Hz, sub-10 ms inference without an FPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1322", "title": "TinyML Compute Estimation for Nordic nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which compute and memory metrics must be estimated before deploying, and what is the theoretical latency floor for a 500k MAC model?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1323", "title": "Diagnosing High Inference Cost on nRF5340 for Keyword Spotting", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose why nRF5340 keyword-spotting inference takes 250 ms and battery life is far below target?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1324", "title": "TinyML Resource Estimation for Keyword Spotting on nRF5340", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate MACs, peak RAM, latency, and quantization choices for nRF5340 keyword spotting?", "chain_ids": ["tinyml-chain-auto-secondary-004-01"], "chain_positions": {"tinyml-chain-auto-secondary-004-01": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1325", "title": "Edge AI Power Budget: Optimizing Gesture Recognition on ESP32-S3", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate cloud training cost and year-long ESP32-S3 inference energy for a wearable gesture model?", "chain_ids": ["tinyml-chain-auto-secondary-004-05"], "chain_positions": {"tinyml-chain-auto-secondary-004-05": 2}, 
"chain_tiers": {"tinyml-chain-auto-secondary-004-05": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1326", "title": "ESP32-S3 Memory Hierarchy: Capacity, Bandwidth, Latency Tradeoffs", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the memory tiers and how do their capacity, bandwidth, and latency affect TinyML placement?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 0}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1327", "title": "TinyML Memory Latency Analysis on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many CPU cycles remain for non-memory operations in the STM32F4 CNN critical section after SRAM access costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1328", "title": "ESP32-S3 Memory Bottleneck Diagnosis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the likely cause of the inference latency spikes when CPU utilization drops and PSRAM access requests increase?", "chain_ids": ["tinyml-chain-auto-026-02"], "chain_positions": {"tinyml-chain-auto-026-02": 2}, "chain_tiers": {"tinyml-chain-auto-026-02": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1329", "title": "TinyML Model Deployment: nRF5340 Memory Hierarchy Optimization", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should weights, input buffers, and activations be placed on nRF5340 to meet a 10 Hz anomaly detection budget?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1330", "title": "TinyML Memory Hierarchy Optimization on STM32F4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which approach would you recommend for maximizing inference throughput while maintaining acceptable power consumption for a real-time application?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1331", "title": "CNN Feature Map Optimization: SRAM vs. 
External DRAM on Cortex-M7/Ethos-U55", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose this memory bottleneck and quantify the potential performance improvement by moving it to SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1332", "title": "TinyML Memory Hierarchy Optimization on Nordic nRF5340", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 place 100 KB CNN weights, 50 KB activations, buffers, and code across flash, SRAM, and registers?", "chain_ids": ["tinyml-chain-auto-026-04"], "chain_positions": {"tinyml-chain-auto-026-04": 2}, "chain_tiers": {"tinyml-chain-auto-026-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1333", "title": "Optimizing CNN Memory Footprint on ARM Cortex-M4 for TinyML", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze tensor arena layout and operator scheduling so a Cortex-M4 CNN fits within 256 KB SRAM?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1334", "title": "Tensor Arena Planning for TinyML on Nordic nRF5340", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you plan a flat tensor arena and flash versus SRAM placement for a CNN on the Nordic nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1335", "title": "ESP32-S3 TinyML Inference Latency Due to Data Movement", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the most likely root cause(s) for the increased latency and propose a solution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1336", "title": "Optimizing Data Movement for Real-time TinyML on Cortex-M7 + Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you use DMA-capable buffers, zero-copy, cache maintenance, and MPU placement to avoid CPU-bound data movement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1337", "title": "Optimizing Sensor Data Ingress on nRF5340 for TinyML Inference", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", 
"bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 move 16 KB sensor inputs from a high-speed peripheral to a TinyML model with minimal CPU overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1338", "title": "Optimizing Sensor Data Ingest for TinyML on ESP32-S3", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 DMA and buffering replace CPU copies for 200 KB sensor transfers in a 50 ms anomaly pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1339", "title": "Optimizing DMA and Data Movement for Real-time ML Inference on Cortex-M7 + Ethos-U55", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should DMA and ping-pong buffering raise a Cortex-M7 plus Ethos-U55 image pipeline from 10 FPS to 20 FPS?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1340", "title": "Optimizing Data Movement for TinyML Inference on Nordic nRF5340", "topic": "dma-data-movement", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an nRF5340 pipeline I2S audio windows into an ML model and BLE output with minimal CPU copying?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1341", "title": "Latency Decomposition for Keyword Spotting on ARM Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you decompose, measure, and analyze latency on this resource-constrained embedded platform to meet the target?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1342", "title": "ESP32-S3 Edge Latency Decomposition for Real-time Object Detection", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose the 700 ms latency and identify the components blocking the 300 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1343", "title": "Optimizing Real-time KWS Latency on Cortex-M7 + Ethos-U55", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How should you decompose and optimize end-to-end KWS latency on a Cortex-M7 plus Ethos-U55 system under a 100 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1344", "title": "Latency Decomposition on Nordic nRF5340 for TinyML Inference", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you measure and optimize TTFT, TPOT, BLE, preprocessing, and postprocessing latency on nRF5340 anomaly detection?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1345", "title": "TinyML Keyword Spotting Latency Decomposition on ARM Cortex-M4", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you decompose Cortex-M4 KWS latency across audio acquisition, preprocessing, inference, and postprocessing under 150 ms?", "chain_ids": ["tinyml-chain-auto-secondary-003-09"], "chain_positions": {"tinyml-chain-auto-secondary-003-09": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-09": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1346", "title": "Latency Decomposition for TinyML Keyword Spotting on Cortex-M7 + Ethos-U55", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you compare Model A and Model B using first-result, per-inference, transfer, preprocessing, NPU, and postprocessing latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1347", "title": "Real-time Anomaly Detection Latency on Nordic nRF5340", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can an nRF5340 wearable exceed a 250 ms anomaly-alert budget even when TinyML inference itself is fast?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1348", "title": "Latency Decomposition and Optimization for Real-time TinyML on ESP32-S3", "topic": "latency-decomposition", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you profile and optimize ESP32-S3 KWS latency from audio capture through MQTT alert under a 100 ms target?", "chain_ids": ["tinyml-chain-auto-secondary-003-12"], "chain_positions": {"tinyml-chain-auto-secondary-003-12": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-003-12": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1349", "title": "Real-Time TinyML Inference on Nordic nRF5340", "topic": 
"real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the architectural design guarantee a 33 ms real-time frame budget and prevent ANR timeouts on the nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1350", "title": "Real-Time TinyML Inference on ESP32-S3: Diagnosing Latency and Jank", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system architecture be restructured to systematically diagnose and mitigate latency jitter to guarantee the 33ms real-time deadline?", "chain_ids": ["tinyml-chain-auto-024-10"], "chain_positions": {"tinyml-chain-auto-024-10": 5}, "chain_tiers": {"tinyml-chain-auto-024-10": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1351", "title": "Real-Time Gesture Recognition on Nordic nRF5340: Meeting 30ms Latency Budgets", "topic": "real-time-deadlines", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If the initial WCET analysis reveals that the inference frequently exceeds 30ms, what architectural, algorithmic, or software-level changes would you propose to bring it within budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1352", "title": "Optimizing ML Inference Latency on Cortex-M7/Ethos-U55 for Real-time Edge Applications", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically profile and identify the primary bottlenecks contributing to this 30 ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-012-21"], "chain_positions": {"tinyml-chain-auto-secondary-012-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1353", "title": "Optimizing TinyML Inference Latency on STM32F4", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What specific profiling techniques and tools would you employ, considering the lack of an FPU and limited SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-012-22"], "chain_positions": {"tinyml-chain-auto-secondary-012-22": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-22": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1354", "title": "Optimizing Real-time ML Inference on ESP32-S3: Diagnosing Latency Bottlenecks", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", 
"status": "published", "phase": "inference", "question": "How should you diagnose whether ESP32-S3 TinyML latency spikes come from compute, memory, or I/O bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1355", "title": "TinyML Latency Bottleneck on Cortex-M7+Ethos-U55", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you identify the specific latency bottlenecks (compute, memory, or I/O) and determine NPU utilization?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1356", "title": "Optimizing TinyML Latency on FPU-less Microcontrollers", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which bottlenecks keep an STM32F4 anomaly model at 150 ms, and which candidate architecture can meet the 80 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1357", "title": "Optimizing TinyML Latency on ESP32-S3", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L2", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP-IDF profiling metrics and tools should identify why ESP32-S3 KWS latency exceeds 150 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1358", "title": "Optimizing Latency on Cortex-M7 for TinyML Inference", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you find and quantify the bottleneck preventing the Ethos-U55 audio classifier from meeting its 5 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1359", "title": "Optimizing Latency for a TinyML Model on nRF5340", "topic": "profiling-bottleneck-analysis", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Describe a detailed profiling methodology to identify whether the bottleneck is compute, memory access, or I/O, utilizing standard embedded development tools?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1360", "title": "INT8 Quantization Accuracy Drop on ARM Cortex-M4 STM32F4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why might INT8 post-training quantization cause an unacceptable accuracy drop for a Cortex-M4 image classifier?", "validated": true, "math_verified": 
true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1361", "title": "Quantization Drift on Nordic nRF5340 for Keyword Spotting", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of this accuracy drop and propose a solution, considering the nRF5340's constraints?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1362", "title": "Optimizing a Quantized TinyML Model for Resource-Constrained ARM Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you quantize a Cortex-M4 voice activity CNN to preserve accuracy while avoiding FPU emulation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1363", "title": "Quantization Strategy for Edge Deployment on Cortex-M7 + Ethos-U55", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you improve INT8 quantization for a Cortex-M7 plus Ethos-U55 model with a 5% PTQ accuracy drop?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1364", "title": "Extreme Sub-4-bit Quantization for Keyword Spotting on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What strategies would you employ to recover the lost accuracy while maintaining the 2-bit memory footprint and respecting the ESP32-S3's hardware limitations?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1365", "title": "Extreme Quantization for TinyML Transformer on Cortex-M7/Ethos-U55", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach quantizing this model to sub-4-bit precision to fit within the 512KB SRAM limit and meet a 10ms inference latency requirement?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1366", "title": "Extreme Quantization for Keyword Spotting", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the total flash memory footprint of the quantized model, and assuming linear scaling, what is the new SRAM requirement for 
activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1367", "title": "Diagnosing Sub-4-bit Quantization Accuracy Degradation on STM32F4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the root cause of sub-4-bit quantization accuracy degradation and memory faults on a standard microcontroller?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1368", "title": "Sub-4-bit Quantization on Cortex-M7 + Ethos-U55 for TinyML", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you deploy a sub-4-bit keyword-spotting CNN on Cortex-M7 plus Ethos-U55 when an INT8 model exceeds SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-009-06"], "chain_positions": {"tinyml-chain-auto-secondary-009-06": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-06": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1369", "title": "Extreme Quantization on nRF5340: Architecture Evaluation", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you systematically evaluate and recommend the best model for the nRF5340, considering performance, resource constraints, and accuracy recovery feasibility?", "chain_ids": ["tinyml-chain-auto-secondary-009-07"], "chain_positions": {"tinyml-chain-auto-secondary-009-07": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1370", "title": "Extreme Quantization on Resource-Constrained ARM Cortex-M4", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size and evaluate ternary or 2-bit quantization for a Cortex-M4 gesture CNN targeting sub-10 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1371", "title": "Optimizing a TinyML LLM with Sub-4-bit Quantization on ESP32-S3", "topic": "extreme-quantization", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose bottlenecks, propose a sub-4-bit strategy, quantify gains, and recover accuracy for a TinyML LLM on ESP32-S3?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1372", "title": "Nordic nRF5340 Power Mode Analysis", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L2", 
"zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should DVFS P-states and power caps be managed on such a device to optimize energy efficiency, considering its constrained resources and variable workload?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1373", "title": "Energy-Efficient TinyML Inference on ARM Cortex-M4 STM32F4", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze STM32F4 energy per inference using CMOS power, DVFS concepts, fixed-point CNNs, and memory access costs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1374", "title": "Optimizing TinyML Power Budget on ESP32-S3 for Battery-Powered Edge Devices", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you approach the power budgeting of this system, considering the processor's DVFS P-states, memory access energy (SRAM vs. PSRAM), and TDP limitations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1375", "title": "TinyML Device: Unexpected Idle Power Drain", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the most likely root causes for this accelerated battery drain, and how would you systematically diagnose and rectify the problem?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1376", "title": "Optimizing ML Inference Power on Nordic nRF5340", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you use clock scaling, sleep states, peripheral gating, and energy-per-inference measurements to keep nRF5340 anomaly detection within a 100 uW power budget?", "chain_ids": ["tinyml-chain-auto-secondary-009-10"], "chain_positions": {"tinyml-chain-auto-secondary-009-10": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-009-10": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1377", "title": "Optimizing an Ultra-Low-Power Edge ML Model on Nordic nRF5340 for Energy Efficiency", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you design a two-year coin-cell nRF5340 gesture model using Horowitz energy principles and energy-aware operators?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1378", "title": "Diagnosing Unexpected Energy Drain in TinyML on STM32F4", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why can memory access dominate energy drain for an 8-bit CNN on an STM32F4 despite a low MAC count?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1379", "title": "Optimizing CNN Energy on ARM Cortex-M4 STM32F4", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you cut CNN energy on an STM32F4 when SRAM access costs 10x a MAC and external flash costs 100x?", "chain_ids": ["tinyml-chain-auto-secondary-012-17"], "chain_positions": {"tinyml-chain-auto-secondary-012-17": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1380", "title": "Energy-Aware Inference Optimization on ESP32-S3 for TinyML", "topic": "energy-per-operation", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you quantify potential energy savings using the Horowitz energy principles for memory access vs. arithmetic operations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1382", "title": "Optimizing ESP32-S3 for Low-Power Acoustic Anomaly Detection with Solar and Coin Cell Power", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can an ESP32-S3 acoustic monitor meet a 1-year lifetime with 10-minute inference and CR2032 plus 2 V solar backup, and how should the power budget and duty-cycling strategy be designed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1383", "title": "Optimizing CNN for Real-time Anomaly Detection on Nordic nRF5340", "topic": "cnn-efficient-design", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you redesign a CNN with depthwise separable convolutions and inverted residuals to meet latency and power limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1384", "title": "ESP32-S3 TinyML Deployment: Memory Feasibility Analysis for a Keyword Spotting CNN", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you assess whether a 300K-parameter INT8 KWS CNN with 200 KB activations fits on ESP32-S3 memory?", "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1385", "title": "TinyML Model Deployment on ARM Cortex-M4: Memory Constraint Analysis", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can a Cortex-M4 KWS model with 180 KB weights and 90 KB activations fit and run effectively within 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1386", "title": "TinyML Model Feasibility on Nordic nRF5340 for Gesture Recognition", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Based on the nRF5340's specifications, which model is more feasible to deploy given real-time memory and compute constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1387", "title": "TinyML Model Deployment on ARM Cortex-M4: Memory Constraints", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you estimate weights, activations, buffers, and bottlenecks for a 100K-parameter INT8 CNN on an STM32F4?", "chain_ids": ["tinyml-chain-auto-secondary-003-14"], "chain_positions": {"tinyml-chain-auto-secondary-003-14": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-003-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1388", "title": "ESP32-S3 Model Deployment: Memory Footprint and Feasibility Analysis for a TinyML Vision Task", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you assess the deployment feasibility of a 500K-parameter CNN and diagnose potential memory bottlenecks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1389", "title": "Estimating INT8 CNN Size, SRAM Footprint, and Ethos-U55 Latency on Cortex-M7", "topic": "model-size-estimation", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you estimate parameter count, INT8 memory footprint, and Cortex-M7 plus Ethos-U55 inference latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1390", "title": "Hardware-Aware NAS on ESP32-S3: Performance & Memory Trade-offs", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why would the hardware-aware NAS converge to a 400 KB SRAM, 50 MFLOP model 
with 150 ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-011-21"], "chain_positions": {"tinyml-chain-auto-secondary-011-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-011-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1391", "title": "Hardware-Aware NAS for TinyML on Cortex-M7 + Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should a hardware-aware NAS pipeline find an Ethos-U55 anomaly model under 80 ms latency and 350 KB memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1392", "title": "Hardware-aware NAS Deployment on Resource-Constrained Microcontrollers", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root causes of the NAS model deployment failure considering STM32F4 hardware limitations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1393", "title": "Hardware-Aware NAS on ESP32-S3 for TinyML Vision", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should an ESP32-S3 hardware-aware NAS constrain its search space and construct its reward function to ensure the model satisfies the SRAM and latency targets?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1394", "title": "Hardware-Aware NAS on Ethos-U55: Memory and Latency Constraints", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What concerns remain for an Ethos-U55 NAS candidate with 250K INT8 parameters and 500M MACs despite fitting the parameter memory?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1395", "title": "Hardware-Aware NAS on Nordic nRF5340: Memory and Latency Evaluation", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L5", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Which nRF5340 NAS candidate is better at 10 Hz: 500K FLOP FP32 with 160 KB SRAM or 1.2M FLOP INT8 with 70 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-011-20"], "chain_positions": {"tinyml-chain-auto-secondary-011-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", 
"realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1396", "title": "Hardware-Aware NAS for Vision on ARM Cortex-M4", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should hardware-aware NAS for an FPU-less Cortex-M4 constrain search space, strategy, and feedback for vision models?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1397", "title": "Hardware-aware NAS for Edge Deployment on Cortex-M7/Ethos-U55", "topic": "neural-architecture-search", "competency_area": "architecture", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should Cortex-M7 and Ethos-U55 NAS incorporate SRAM, FLOPs, latency, and memory estimates before deployment?", "chain_ids": ["tinyml-chain-auto-secondary-011-19"], "chain_positions": {"tinyml-chain-auto-secondary-011-19": 4}, "chain_tiers": {"tinyml-chain-auto-secondary-011-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1398", "title": "Pruning Strategies for nRF5340 Model Deployment", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What is the difference between structured and unstructured pruning, and which is better for nRF5340 edge deployment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1399", "title": "Pruning Strategies for Resource-Constrained Microcontrollers (ARM Cortex-M4)", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does structured pruning beat unstructured sparsity for a 1.5M-param CNN on a no-FPU STM32F4 with 256 KB SRAM?", "chain_ids": ["tinyml-chain-auto-secondary-005-16"], "chain_positions": {"tinyml-chain-auto-secondary-005-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-005-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1400", "title": "Optimizing Keyword Spotting CNN for ESP32-S3 with Pruning & Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you design a comprehensive strategy to optimize this model using pruning and sparsity techniques to fit within memory constraints and meet performance targets?", "chain_ids": ["tinyml-chain-auto-secondary-005-15"], "chain_positions": {"tinyml-chain-auto-secondary-005-15": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-005-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1401", "title": "Ethos-U55 Pruning Impact: Latency & Memory for Object Detection", "topic": 
"pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How do you calculate latency reduction and parameter storage saved from 2:4 pruning?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1402", "title": "On-Device Model Pruning for STM32F4 with Sparsity and Performance Tradeoffs", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the sparsity pattern be approached to maximize inference speedup on a Cortex-M4 without an FPU, and what are the trade-offs?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1403", "title": "Optimizing a CNN for Edge Deployment via Structured Pruning on Ethos-U55", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you prune an INT8 CNN for Ethos-U55 so it fits 512 KB SRAM while preserving dense NPU acceleration?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1404", "title": "Optimizing TinyML Inference on ARM Cortex-M4 via Pruning and Sparsity", "topic": "pruning-sparsity", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should pruning and sparsity optimize a Cortex-M4 KWS model from 400 KB and 300 ms to 200 KB and 100 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1405", "title": "Diagnosing Distillation Accuracy Drift and CPU Bottlenecks on ESP32-S3", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should the engineering team systematically diagnose the distillation accuracy drop and CPU bottleneck on the ESP32-S3 architecture?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1406", "title": "TinyML Knowledge Distillation for nRF5340 Anomaly Detection", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you distill a 5 MB teacher anomaly model into a student under 500 KB flash, 150 KB SRAM, and 50 ms?", "chain_ids": ["tinyml-chain-auto-secondary-014-08"], "chain_positions": {"tinyml-chain-auto-secondary-014-08": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-014-08": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1407", "title": "Knowledge Distillation vs. 
Pruning on ARM Cortex-M4 for Real-time Anomaly Detection", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you compare knowledge distillation and pruning for a real-time Cortex-M4 anomaly detector with no FPU?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1408", "title": "Optimizing Knowledge Distillation for Edge Deployment", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you train an MCU student model with knowledge distillation, quantization, and hardware-aware profiling, and why choose it over pruning?", "chain_ids": ["tinyml-chain-auto-secondary-014-07"], "chain_positions": {"tinyml-chain-auto-secondary-014-07": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-014-07": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1409", "title": "Optimizing TinyML Image Classification with Knowledge Distillation on Cortex-M7 + Ethos-U55", "topic": "knowledge-distillation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose bottlenecks in the pruned model and propose a knowledge distillation strategy to recover accuracy on the Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1410", "title": "Optimizing a TinyML Model for Cortex-M7 with Ethos-U55: Graph Compilation Strategy", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a graph compiler convert and partition a TensorFlow CNN for Cortex-M7 plus Ethos-U55 real-time KWS?", "chain_ids": ["tinyml-chain-auto-secondary-004-33"], "chain_positions": {"tinyml-chain-auto-secondary-004-33": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-33": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1411", "title": "Optimizing a TinyML Model for ESP32-S3 with AOT Compilation", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should AOT compilation, operator lowering, and constant folding help an ESP32-S3 KWS CNN fit SRAM and meet 50 ms?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1412", "title": "nRF5340 KWS Inference Optimization: Graph Compilation Challenge", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you diagnose the 500 ms baseline inference bottleneck and propose 
an AOT graph compilation strategy to achieve a 50 ms target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1413", "title": "TinyML Graph Optimization for FPU-less ARM Cortex-M4", "topic": "graph-compilation", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should AOT graph compilation optimize an INT8 Cortex-M4 audio CNN from 50 ms toward sub-10 ms latency?", "chain_ids": ["tinyml-chain-auto-secondary-004-31"], "chain_positions": {"tinyml-chain-auto-secondary-004-31": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-31": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1414", "title": "Ethos-U55 Scheduling for Memory and Parallelism", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Ethos-U55 operator scheduling use memory reuse, parallelism, and layer fusion to reduce SRAM footprint and latency?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1415", "title": "Optimizing Operator Scheduling for TinyML on Nordic nRF5340", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why are memory reuse, selective layer fusion, and realistic parallelism crucial when scheduling CNN operators on edge microcontrollers?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1416", "title": "TinyML Operator Scheduling on ARM Cortex-M4 for Ultra-Low Latency Inference", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you schedule, fuse, and reuse buffers for an INT8 Cortex-M4 CNN to minimize latency and SRAM footprint?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1417", "title": "Diagnosing Inefficient Operator Scheduling on Cortex-M7/Ethos-U55 for TinyML", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does default scheduling leave the Ethos-U55 at 20% utilization with 40% of time spent on SRAM transfers?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1418", "title": "TinyML Operator Fusion on Cortex-M4 for Memory Constrained CNN", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should convolution, ReLU, and pooling be fused and scheduled to avoid intermediate activation OOM on a Cortex-M4 VAD CNN?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1419", "title": "Optimizing Operator Scheduling for Memory and Performance on ESP32-S3", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which ESP32-S3 operator schedule best balances 512 KB SRAM reuse, dual-core parallelism, and layer fusion?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1420", "title": "Ethos-U55 Operator Scheduling for Memory and Performance", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Cortex-M7 and Ethos-U55 operator scheduling minimize peak SRAM, maximize NPU utilization, and fuse CNN layers?", "chain_ids": ["tinyml-chain-auto-secondary-010-19"], "chain_positions": {"tinyml-chain-auto-secondary-010-19": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-010-19": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1421", "title": "TinyML Model Deployment on nRF5340: Optimizing Operator Schedule for Memory and Performance", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an operator scheduling strategy that allows the model to fit within the nRF5340's 256 KB SRAM constraints and meet the 100 ms latency target?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1422", "title": "CNN Operator Scheduling on ARM Cortex-M4 with Limited SRAM", "topic": "operator-scheduling", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an FPU-less Cortex-M4 schedule CNN operators with memory reuse, fixed-point arithmetic, and fusion under 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1423", "title": "Designing Robust OTA Updates for TinyML on Resource-Constrained STM32F4", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an STM32F4 OTA design use A/B partitions and rollback for an 80 KB TinyML model?", "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1424", "title": "ESP32-S3 A/B OTA Flash Partition Sizing for ML Inference", "topic": "ota-firmware-updates", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you size ESP32-S3 flash partitions for robust A/B OTA updates when the ML firmware image is 3 MB?", "chain_ids": ["tinyml-chain-auto-secondary-004-13"], "chain_positions": {"tinyml-chain-auto-secondary-004-13": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-13": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1427", "title": "Model Conversion and Deployment on ESP32-S3 with TFLite Micro", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What are the critical steps and memory placement strategies for converting a TensorFlow CNN to TFLite Micro on the ESP32-S3?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 0}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1428", "title": "Resolving TFLite Micro Operator Gaps on Nordic nRF5340", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert, optimize, and deploy a TFLite TinyML model on nRF5340 while handling operator coverage gaps?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1429", "title": "Optimizing and Deploying a Quantized Model on ARM Cortex-M4 STM32F4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert a PyTorch or ONNX KWS CNN to TFLite Micro for an FPU-less STM32F4 and fit 256 KB SRAM while maintaining acceptable latency?", "chain_ids": ["tinyml-chain-auto-027-01"], "chain_positions": {"tinyml-chain-auto-027-01": 2}, "chain_tiers": {"tinyml-chain-auto-027-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1430", "title": "Optimizing Semantic Segmentation Deployment on Cortex-M7 + Ethos-U55 with Model Format Conversion", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you convert a PyTorch segmentation model for Cortex-M7 plus Ethos-U55 when unsupported operators threaten latency and memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1431", "title": "Optimizing ML Model Deployment on Nordic nRF5340 for Edge Inference", "topic": 
"model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you approach the conversion, operator handling, and memory optimization to successfully deploy this model within the nRF5340's constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1432", "title": "TinyML Model Deployment and Conversion Challenges on Resource-Constrained ARM Cortex-M4", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you compare PyTorch and TensorFlow KWS models for STM32F4 deployment, conversion, and operator support?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1433", "title": "Optimizing ONNX Model Conversion for ESP32-S3 Edge Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you handle ONNX-to-TFLite Micro operator gaps and memory placement for ESP32-S3 image classification?", "chain_ids": ["tinyml-chain-auto-027-03"], "chain_positions": {"tinyml-chain-auto-027-03": 3}, "chain_tiers": {"tinyml-chain-auto-027-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1434", "title": "TinyML Model Conversion and Operator Gap Management for Nordic nRF5340 Deployment", "topic": "model-format-conversion", "competency_area": "deployment", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Outline a strategy for converting this 2.5 MB ONNX model to fit the nRF5340, detailing format conversion, operator gaps, and memory sizing?", "chain_ids": ["tinyml-chain-auto-027-04"], "chain_positions": {"tinyml-chain-auto-027-04": 2}, "chain_tiers": {"tinyml-chain-auto-027-04": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1435", "title": "Graceful Degradation for Real-time Anomaly Detection on Constrained TinyML Device", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M4 anomaly detector degrade gracefully under noise, congestion, or battery stress while preserving utility?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1436", "title": "ESP32-S3 Anomaly Detection with Graceful Degradation", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a system that employs graceful degradation to maintain operational status even under severe resource 
constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1437", "title": "TinyML Graceful Degradation for Predictive Maintenance on Cortex-M7/Ethos-U55", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose this performance degradation and design a graceful degradation ladder to maintain fail-operational status?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1438", "title": "Graceful Anomaly Detection on Constrained TinyML", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a graceful degradation strategy, considering degradation ladders, model fallbacks, fail-safe vs. fail-operational modes, and potential quality-of-service shedding?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1439", "title": "Graceful Degradation Architectures for TinyML on ESP32-S3", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which graceful degradation architecture is more suitable for critical ESP32-S3 anomaly detection, and why?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1440", "title": "Graceful Degradation for Real-time TinyML Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should a Cortex-M7 plus Ethos-U55 anomaly detector use degradation ladders, fail-safe states, and QoS shedding under overload?", "chain_ids": ["tinyml-chain-auto-secondary-012-20"], "chain_positions": {"tinyml-chain-auto-secondary-012-20": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-012-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1441", "title": "TinyML Graceful Degradation on STM32F4 for Real-time Anomaly Detection", "topic": "graceful-degradation", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an STM32F4 safety-critical anomaly detector degrade when sensor noise or data spikes break inference latency thresholds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1442", "title": "ASIL-B TinyML Determinism on ARM Cortex-M4 STM32F4", "topic": "safety-certification", 
"competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why do ASIL-B determinism, watchdogs, self-tests, and limited Cortex-M4 resources require WCET, interrupt, and memory discipline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1443", "title": "ESP32-S3 Watchdog and ML Integrity Evidence for ASIL B", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design ESP32-S3 watchdog and ML integrity checks to support an ISO 26262 ASIL B safety case?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1444", "title": "Functional Safety for TinyML on Resource-Constrained Automotive ECUs", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you ensure ISO 26262 ASIL-B functional safety for a Cortex-M4 brake-wear TinyML model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1445", "title": "ESP32-S3 TinyML Functional Safety & Determinism Optimization for ISO 26262", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose and reduce ESP32-S3 TinyML latency spikes that jeopardize ASIL-B deterministic execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1446", "title": "ASIL-C ML Safety on Cortex-M7 + Ethos-U55 for Automotive", "topic": "safety-certification", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Given the strict requirements of ISO 26262 ASIL-C, how would you architect the system to ensure functional safety, particularly concerning the ML inference pipeline?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1447", "title": "Hardware Security for TinyML Model Integrity on Nordic nRF5340", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which hardware security feature protects TinyML firmware and model integrity against tampering or extraction?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1448", "title": "Secure TinyML Anomaly Detection on ESP32-S3 Against Adversarial Attacks", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": 
"inference", "question": "How should an ESP32-S3 anomaly detector defend against model tampering, side-channel leakage, and poisoned OTA model updates?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1449", "title": "SRAM Budgeting for On-Device Adversarial Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many additional 4096-element INT8 feature vectors per class fit in remaining SRAM for an on-device adversarial detector?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1450", "title": "Adversarial Attack on ESP32-S3 TinyML Classifier", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you analyze and defend an ESP32-S3 gesture classifier against adversarial sensor perturbations within resource limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1451", "title": "Adversarial Robustness Evaluation on Cortex-M7 Plus NPU for TinyML", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design an evaluation framework to compare resilience against common adversarial attacks given platform constraints?", "chain_ids": ["tinyml-chain-auto-secondary-011-16"], "chain_positions": {"tinyml-chain-auto-secondary-011-16": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-16": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1452", "title": "Optimizing Adversarial Defense on Resource-Constrained TinyML", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should the system implement low-overhead adversarial defenses without breaking real-time latency deadlines on non-FPU hardware?", "chain_ids": ["tinyml-chain-auto-secondary-011-17"], "chain_positions": {"tinyml-chain-auto-secondary-011-17": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-011-17": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1453", "title": "Adversarial Robustness in ESP32-S3 Anomaly Detection", "topic": "adversarial-robustness", "competency_area": "reliability", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 anomaly detector mitigate adversarial sensor readings while balancing SRAM, PSRAM, and CPU overhead?", "chain_ids": ["tinyml-chain-auto-secondary-011-18"], "chain_positions": {"tinyml-chain-auto-secondary-011-18": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-011-18": "secondary"}, "validated": 
true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1454", "title": "Mitigating Dataset Bias in TinyML Gesture Recognition on Cortex-M7/Ethos-U55", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "What data selection or annotation workflow helps mitigate dataset bias for an Ethos-U55 gesture model under TinyML constraints?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 0}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1455", "title": "Optimizing TinyML Data Labeling for nRF5340 Resource Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "Why does the current active learning pipeline fail under BLE and memory limits, and how should data selection be optimized?", "chain_ids": ["tinyml-chain-auto-027-19"], "chain_positions": {"tinyml-chain-auto-027-19": 1}, "chain_tiers": {"tinyml-chain-auto-027-19": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1456", "title": "TinyML Anomaly Detection: Data Curation for ARM Cortex-M4", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you design a device-aware data curation and labeling pipeline for an FPU-less Cortex-M4 anomaly detector?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1458", "title": "Diagnosing Field Performance Degradation Due to Dataset Bias in TinyML", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose the root cause of this performance degradation from a dataset curation and labeling perspective, specifically considering the tinyML platform constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1459", "title": "nRF5340 TinyML Anomaly Detection Data Curation Pipeline", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should an nRF5340 data curation pipeline select, annotate, and transfer high-value anomaly samples under 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1460", "title": "TinyML Dataset Curation: Annotation Budget & SRAM Constraints", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "training", "question": 
"How many 1-second audio snippets fit a 100-hour labeling budget, and how many 2 KB feature vectors could theoretically fit in the STM32F4's 256 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1461", "title": "Edge AI Keyword Spotting: Active Learning for Constrained Dataset Curation", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should active learning select rare-keyword audio samples for an Ethos-U55 KWS model while managing bias and SRAM limits?", "chain_ids": ["tinyml-chain-auto-027-21"], "chain_positions": {"tinyml-chain-auto-027-21": 3}, "chain_tiers": {"tinyml-chain-auto-027-21": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1462", "title": "Optimizing TinyML Data Curation on nRF5340 for Active Learning", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How would you diagnose and optimize this on-device data curation and transmission bottleneck to accelerate model improvement, and how would you quantify the impact of your proposed solution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1463", "title": "TinyML Anomaly Detection: Data Curation for Constrained Devices", "topic": "dataset-curation", "competency_area": "data", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "training", "question": "How should you curate and label vibration data so a Cortex-M4 anomaly model is representative, unbiased, and deployable?", "chain_ids": ["tinyml-chain-auto-027-20"], "chain_positions": {"tinyml-chain-auto-027-20": 3}, "chain_tiers": {"tinyml-chain-auto-027-20": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1465", "title": "TinyML Stream Processing on STM32F4: Anomaly Detection Bottlenecks", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does moving from FFT to a small CNN break real-time STM32F4 vibration ingestion despite only 6 KB/s raw data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1466", "title": "Real-time RMS Feature Extraction on Cortex-M7 for TinyML Streaming", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L2", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How many Cortex-M7 cycles per incoming 10 kHz sample are available for ingestion and sliding-window management?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1467", "title": "TinyML Streaming Anomaly on nRF5340", "topic": "streaming-ingestion", 
"competency_area": "data", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the most likely root cause of the data drops and CPU spikes before inference on the nRF5340?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1469", "title": "Real-Time Anomaly Detection on TinyML: Cortex-M7 + Ethos-U55 Architecture Evaluation", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should Cortex-M7 plus Ethos-U55 streaming architectures compare feature extraction versus raw-data NPU inference?", "chain_ids": ["tinyml-chain-auto-secondary-013-27"], "chain_positions": {"tinyml-chain-auto-secondary-013-27": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-27": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1470", "title": "Real-time Sensor Data Ingestion Optimization on Cortex-M4 TinyML", "topic": "streaming-ingestion", "competency_area": "data", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should you diagnose and eliminate Cortex-M4 sensor data loss in a fixed-point streaming feature pipeline?", "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1471", "title": "Federated Learning on ESP32-S3: Convergence and Memory Challenges", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Why does ESP32-S3 federated learning of a 2 MB model suffer slow convergence and OOM errors, given its hardware constraints?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 0}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1472", "title": "Federated TinyML for Anomaly Detection on Cortex-M7/Ethos-U55", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you design a federated architecture that minimizes communication overhead, handles non-IID data, and fits within 512 KB SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1473", "title": "Federated Learning on Resource-Constrained BLE Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Outline a concrete architecture and strategy to implement federated averaging given the strict 64KB RAM and BLE communication constraints?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": 
"", "napkin_math": ""}}, {"id": "tinyml-1474", "title": "Optimizing Federated Averaging on Resource-Constrained Edge Devices", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should federated averaging on STM32F4 sensors reduce communication cost and on-device training energy for non-IID data?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1475", "title": "Federated Learning on ESP32-S3: Scaling a TinyML Model for Non-IID Data", "topic": "federated-learning", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should ESP32-S3 federated averaging handle non-IID data and LoRaWAN limits across 10,000 TinyML sensors?", "chain_ids": ["tinyml-chain-auto-025-06"], "chain_positions": {"tinyml-chain-auto-025-06": 2}, "chain_tiers": {"tinyml-chain-auto-025-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1476", "title": "Essential Responsible AI Documentation for ESP32-S3 TinyML Deployment", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "inference", "question": "Which core piece of information, typically found in a model card or impact assessment, is absolutely essential to document for this system, even with limited on-device storage?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1477", "title": "Responsible AI on Constrained Edge Devices: Model Card & Guardrail Implementation", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should responsible AI practices and on-device guardrails be implemented for an Ethos-U55 industrial anomaly detector?", "chain_ids": ["tinyml-chain-auto-secondary-013-02"], "chain_positions": {"tinyml-chain-auto-secondary-013-02": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-02": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1478", "title": "Responsible AI for TinyML: Edge System Governance on Cortex-M7 + Ethos-U55", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How would you address potential biases, ensure transparency, and handle model lifecycle management within this resource-limited environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1479", "title": "TinyML Guardrails for Safety-Critical Systems 
on nRF5340", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What estimated resources would a heuristic guardrail consume, and what metrics must be added to the model card?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1480", "title": "Responsible AI Architecture Evaluation on Constrained TinyML Devices", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Which architecture better serves responsible AI on an FPU-less Cortex-M4 anomaly detector: a quantized CNN or an interpretable tree ensemble?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1481", "title": "Responsible AI for Edge Safety: ESP32-S3 Anomaly Detection", "topic": "responsible-ai", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How should model cards, impact assessments, red-teaming, and guardrails fit an ESP32-S3 safety-critical anomaly detector during the realization phase?", "chain_ids": ["tinyml-chain-auto-secondary-013-01"], "chain_positions": {"tinyml-chain-auto-secondary-013-01": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-013-01": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1484", "title": "Power Budget Allocation on Cortex-M7+Ethos-U55 IoT Node", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Can the current duty cycle meet the 7-day operational target?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1489", "title": "Roofline Feasibility Check for MobileNetV2 on Cortex-M4", "topic": "roofline-analysis", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Can MobileNetV2 fit on an STM32F4 with 1 MB flash and 256 KB SRAM, and what latency remains if you shrink it to a 0.35x MobileNetV2 variant and, if needed, reduce the input to 96x96?", "chain_ids": ["tinyml-chain-auto-secondary-013-26"], "chain_positions": {"tinyml-chain-auto-secondary-013-26": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-013-26": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1490", "title": "Lightweight Attention Approximation on ESP32-S3", "topic": "attention-scaling", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Does 4-head self-attention with seq_len 50 and d_model 64 fit in 512 KB SRAM on an ESP32-S3, and what alternative would you propose?", "validated": true, "math_verified": true, "scenario": "", 
"details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1491", "title": "Binary and Ternary Quantization Feasibility on Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Do binary or ternary weights provide a better tradeoff for a 100K-parameter model on a Cortex-M4 considering compute latency and accuracy risks?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1492", "title": "PSRAM vs SRAM Streaming Cost for ESP32-S3 Weight Loads", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "What total streaming time and bandwidth penalty should you expect when ESP32-S3 streams 500 KB of INT8 weights from PSRAM vs internal SRAM, and how does PSRAM access latency affect it?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1494", "title": "Duty Cycling Strategy for Battery Life on Cortex-M4 Wearable", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "How would you calculate battery life and propose a duty cycling strategy for 7-day operation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1496", "title": "Huffman Encoding for Weight Compression on Cortex-M4", "topic": "pruning-sparsity", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "training", "question": "For 200K INT8 weights, estimate compression using bucket-label Huffman codes plus exact value bits and the decode cost?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1498", "title": "ESP32-S3 NAS Search Space Under SRAM Flash and Latency Limits", "topic": "pruning-sparsity", "competency_area": "architecture", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "training", "question": "How would you design an ESP32-S3 NAS search space under 100 KB SRAM, 500 KB flash, 80% accuracy, and 50 ms latency?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1499", "title": "TinyML System Specification for Predictive Maintenance Sensor Node", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you specify a predictive maintenance system for industrial motors using a sensor node with Cortex-M7+Ethos-U55?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1500", "title": "MCU Sleep Mode Strategy for Always-On Wake Word 
Detection", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "What is the average power of an ESP32-S3 two-stage wake-word detector with 10 fps prefilter and 5% full-model triggers?", "chain_ids": ["tinyml-chain-auto-secondary-009-09"], "chain_positions": {"tinyml-chain-auto-secondary-009-09": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-009-09": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1501", "title": "INT4 Weight Quantization Feasibility for Cortex-M4 Without DSP", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is INT4 worthwhile for a 200 KB INT8 model on a Cortex-M4 without DSP when the flash budget is 64 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1502", "title": "SRAM Tiling Strategy for Large Conv Layer on Cortex-M7+Ethos-U55", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Is your colleague correct that the 28x28x96 Ethos-U55 CNN layer needs tiling within 512 KiB SRAM, and how would you tile a larger 56x56x96 input?", "chain_ids": ["tinyml-chain-auto-026-03"], "chain_positions": {"tinyml-chain-auto-026-03": 1}, "chain_tiers": {"tinyml-chain-auto-026-03": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1503", "title": "MCU Power State Machine for Multi-Model ML Pipeline", "topic": "power-budgeting", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can an ESP32-S3 three-stage ML pipeline meet 1-year life on a 2000 mAh LiPo with a 5 mW always-on preprocessor?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1517", "title": "Tensor Arena Sizing for MobileNetV2 on Cortex-M7", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "inference", "question": "Why is estimating a 2.1 MB SRAM requirement mathematically flawed, and what is the true arena size to fit a 96x96 INT8 MobileNetV2 within 512 KB of SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1518", "title": "Flash vs SRAM Weight Placement Strategy", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "What is the performance penalty, and which layers should you prioritize copying to SRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-1519", "title": "Tensor Lifetime Analysis for U-Net on MCU", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "How do you reduce peak memory below 512 KB without changing the model architecture?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 3}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1520", "title": "Scratch Buffer Allocation for Convolution", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "inference", "question": "For a 3x3 conv with 32 input channels processing a 24x24 feature map, how large is the im2col scratch buffer, and how does this affect your tensor arena budget?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1521", "title": "Arena Planning for Multi-Model Deployment", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "design", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How should an ESP32-S3 share arenas for sequential VAD, KWS, and IC models needing 15 KB, 85 KB, and 120 KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1522", "title": "Memory-Optimal Operator Execution Order", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "Which execution order (A→B→C→D or A→B→D→C) has lower peak arena usage on a 512 KB Cortex-M7, and what is the general principle?", "chain_ids": ["tinyml-chain-auto-026-16"], "chain_positions": {"tinyml-chain-auto-026-16": 4}, "chain_tiers": {"tinyml-chain-auto-026-16": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1523", "title": "Peak SRAM Optimization via Channel Splitting", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "inference", "question": "How does channel splitting (computing 64 channels at a time) reduce the peak?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1524", "title": "PSRAM Spilling Strategy for ESP32-S3", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "optimization", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "If you place the entire arena in PSRAM, how much slower is inference, and what hybrid SRAM/PSRAM placement minimizes the slowdown?", "chain_ids": ["tinyml-chain-auto-026-17"], "chain_positions": {"tinyml-chain-auto-026-17": 1}, "chain_tiers": {"tinyml-chain-auto-026-17": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": 
{"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1525", "title": "TFLite Micro Arena Budgeting Under a 310 KB SRAM Limit", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "create", "status": "published", "phase": "inference", "question": "Can all 4 models fit under a fixed 310 KB tensor arena, and what changes are required if not?", "chain_ids": ["tinyml-chain-auto-026-18"], "chain_positions": {"tinyml-chain-auto-026-18": 2}, "chain_tiers": {"tinyml-chain-auto-026-18": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1526", "title": "ESP32-S3 Ping-Pong DMA Audio Buffering for KWS", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "inference", "question": "Can smaller ping-pong DMA buffers cut ESP32-S3 audio capture SRAM by 50% for 1-second, 16 kHz KWS windows?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1527", "title": "Compile-Time Arena Verification", "topic": "tensor-arena-planning", "competency_area": "memory", "track": "tinyml", "level": "L6+", "zone": "mastery", "bloom_level": "create", "status": "published", "phase": "inference", "question": "How would you design a compile-time verification system that catches arena overflows before flashing firmware?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1541", "title": "BLE Bandwidth Embedding Constraint", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design the required embedding compression ratio to ensure a maximum 10ms transmission latency per embedding?", "visual": {"kind": "svg", "path": "tinyml-1541.svg", "alt": "Fanout diagram showing multiple Cortex-M4 sensor nodes transmitting data to a single central hub via BLE.", "caption": "Sensor Constellation Topology"}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1542", "title": "Cortex-M4 Energy-Harvesting Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose an optimal checkpointing interval that minimizes total training overhead while bounding maximum lost progress (RPO) to 1 minute?", "visual": {"kind": "svg", "path": "tinyml-1542.svg", "alt": "Timeline showing training progress interrupted by power failures, with checkpoints saving state periodically.", "caption": "Training Progress and Rollback"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1543", "title": "Hailo-8 Wake-up 
Energy Drain", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Diagnose the power drain discrepancy by calculating the actual continuous average power consumption causing the 2-day battery death?", "visual": {"kind": "svg", "path": "tinyml-1543.svg", "alt": "Timeline showing sleep and wake power spikes, highlighting a raised sleep baseline power.", "caption": "Elevated Sleep Power Baseline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1545", "title": "Hailo-8 PCIe Frame Ingestion", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you optimize the host-to-accelerator pipeline to prevent PCIe bottlenecking, computing raw bandwidth before and after INT8 cast?", "visual": {"kind": "svg", "path": "tinyml-1545.svg", "alt": "Diagram showing 4 camera streams merging into a host CPU, crossing a PCIe link to the Hailo-8 accelerator.", "caption": "PCIe Gen 3 x2 Bottleneck"}, "chain_ids": ["tinyml-chain-auto-secondary-017-51"], "chain_positions": {"tinyml-chain-auto-secondary-017-51": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-51": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1546", "title": "Cortex-M4 Audio Buffer Overflow", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the time elapsed from an empty buffer until the system drops its first audio packet?", "visual": {"kind": "svg", "path": "tinyml-1546.svg", "alt": "Linear growth chart showing the buffer filling up steadily over 5 seconds until it hits the 100-packet cap.", "caption": "Deterministic Buffer Growth"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1547", "title": "Decentralized Federate Tree Aggregation", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the theoretical lower-bound latency for a single bottom-up aggregation phase from leaf to root?", "visual": {"kind": "svg", "path": "tinyml-1547.svg", "alt": "A binary tree diagram showing 16 nodes with data flowing upwards from the leaves to a single root node.", "caption": "Binary Spanning Tree (Depth 4)"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1551", "title": "Cortex-M4 Vibration FFT Queue", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What pending-window FFT queue size, excluding the active FFT, prevents loss under end-of-window arrivals?", "visual": {"kind": "svg", "path": "tinyml-1551.svg", "alt": "Chart showing queue building up to 1 during the 5 second burst, then 
draining to 0 during silence.", "caption": "Burst Queue Accumulation"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1560", "title": "Raw ECG BLE Stream Viability", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the bandwidth requirements and determine if raw streaming is viable or if edge inference is strictly required?", "visual": {"kind": "svg", "path": "tinyml-1560.svg", "alt": "Data flow from patch to phone showing a bottleneck.", "caption": "BLE Bandwidth Bottleneck."}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1562", "title": "Hailo-8 PCIe Gen3 x1 Video Bandwidth", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the realistic per-direction bandwidth and ingress utilization of a PCIe Gen3 x1 link streaming 1080p 60fps video?", "chain_ids": ["tinyml-chain-auto-secondary-017-51"], "chain_positions": {"tinyml-chain-auto-secondary-017-51": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-51": "secondary"}, "validated": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1563", "title": "M4 Boot Energy Dominance", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the average power consumption over a 10-minute window when duty-cycling?", "visual": {"kind": "svg", "path": "tinyml-1563.svg", "alt": "Timeline showing 2s at 15mA, 1s at 20mA, and 597s at 0mA.", "caption": "Boot vs Inference Energy profile."}, "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 1}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1567", "title": "Flash-to-SRAM Paging Latency", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply memory-hierarchy principles to calculate the total data transfer time for paging a 160 KB model layer-by-layer?", "visual": {"kind": "svg", "path": "tinyml-1567.svg", "alt": "Bar chart comparing Flash read bandwidth with SRAM write bandwidth.", "caption": "Memory bus bottleneck."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1593", "title": "Cortex-M4 Sensor Queue Sizing", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the optimal queue 
depth and drop policy to handle a Poisson arrival rate of 50 Hz given a model execution time of 15 ms, to maintain 99% availability without overflowing a 2 KB SRAM buffer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1594", "title": "MCU Audio Compute Bounding", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "For the hearing-aid Cortex-M4, how would you compare the CNN and RNN keyword-spotters by total MACs, cycles, and CPU utilization over one 1-second audio window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1597", "title": "Cascaded Wake-Up Architecture", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Propose a cascaded wake-up duty-cycling architecture using a 1uW passive PIR sensor to trigger a fast 2ms INT8 cascade model before waking up the main 50ms classification model?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1600", "title": "RTOS Preemption for Inference", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does replacing the polling loop with an RTOS task scheduler utilizing preemption fix the missing data issue?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1602", "title": "Depthwise Conv Cycle Cost on Cortex-M4", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "Implement an analytical model to estimate the cycle count of a 3x3 depthwise convolution output pixel with 16 channels, considering MAC instructions and SRAM load/store overheads?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1605", "title": "Task-Based FRAM Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What task-based checkpointing rule can write registers to FRAM before the final fully-connected layer, including the boundary condition and the recovery procedure on brownout?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1606", "title": "A/B Bank Model OTA Updates", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "implement", "bloom_level": "evaluate", 
"status": "published", "phase": "both", "question": "How does an A/B bank flash layout impact available program space and wear-leveling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1607", "title": "Evaluate minimal queue capacity for transient TinyML sensor bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Assuming the 150 burst events are evenly spaced across the 1-second interval, what is the minimum pending-event queue capacity required to ensure zero dropped events?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1608", "title": "Identify TinyML memory regions for static weights and dynamic activations", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Identify the correct memory regions where the weights and activation buffers must be placed to fit this model on the device?", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1613", "title": "Calculate energy efficiency between slow and fast microcontroller clock modes", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate which MCU configuration consumes less total energy per inference?", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1622", "title": "Pipelining Compute and SPI Transfer Time", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total time to process and transmit 4 chunks if compute takes 10ms per chunk and transmission takes 5ms per chunk?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1626", "title": "Constrained Memory Allocation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How must the model's memory be laid out to successfully execute without out-of-memory errors?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1627", "title": 
"SRAM-Constrained Depthwise Execution", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can a computational schedule be designed to execute this layer entirely on-chip without requiring external DRAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1628", "title": "Peripheral DMA Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "What standard hardware peripheral lets the CPU compute neural network layers while sensor data transfers occur in the background?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1629", "title": "Cortex-M4 Duty Cycle Budget", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum active duty cycle percentage required to achieve a 100uW average power budget?", "visual": {"kind": "svg", "path": "tinyml-1629.svg", "alt": "Pie chart showing 99.4% sleep time and 0.6% active time", "caption": "Duty cycle breakdown for 100uW budget"}, "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 5}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1630", "title": "Hailo-8 Video Inference Queue", "topic": "queueing-theory", "competency_area": "optimization", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the expected queueing delay for an incoming frame, and what strategy bounds average latency under 50ms?", "visual": {"kind": "svg", "path": "tinyml-1630.svg", "alt": "Hockey-stick curve showing latency exponentially rising as utilization approaches 1", "caption": "Latency vs Utilization Curve"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1631", "title": "BLE Mesh Bandwidth Collapse", "topic": "network-bandwidth-bottlenecks", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the bandwidth bottleneck, and can the current architecture support the data rate?", "visual": {"kind": "svg", "path": "tinyml-1631.svg", "alt": "8 nodes pointing to 1 central node", "caption": "8-to-1 Sensor Fanout Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1632", "title": "Solar Brownout Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", 
"status": "published", "phase": "both", "question": "What is the checkpointing strategy into NVRAM (FRAM) that minimizes write energy while ensuring the task completes across reboots?", "visual": {"kind": "svg", "path": "tinyml-1632.svg", "alt": "Timeline showing compute phase, checkpoint at 1s, brownout at 1.5s, and resumption", "caption": "Intermittent Power Execution Timeline"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1634", "title": "Smart-Ag Flash Wear", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Does a 5- or 15-minute checkpoint interval minimize 1-hour expected energy, and by what factor?", "visual": {"kind": "svg", "path": "tinyml-1634.svg", "alt": "Line plot showing expected rollback penalty dropping as checkpoint frequency increases", "caption": "Rollback Penalty vs Checkpoint Interval"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1637", "title": "Energy Harvesting State Loss", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze why the system makes zero progress and propose a non-volatile memory layout to guarantee eventual completion?", "visual": {"kind": "svg", "path": "tinyml-1637.svg", "alt": "Sawtooth timeline of execution progress dropping to 0 repeatedly", "caption": "Execution Progress without Checkpointing"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1639", "title": "LoRa Drone Swarm Consensus", "topic": "collective-communication", "competency_area": "parallelism", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a schedule that avoids packet collisions and compute the minimum consensus time?", "visual": {"kind": "svg", "path": "tinyml-1639.svg", "alt": "A star topology with Drone 0 as the central leader", "caption": "TDMA Star Consensus Topology"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1643", "title": "UART WiFi Bottleneck Analysis", "topic": "network-bandwidth-bottlenecks", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the bandwidth mismatch and justify the required data reduction ratio?", "visual": {"kind": "svg", "path": "tinyml-1643.svg", "alt": "10 BLE nodes mapping to an MCU, bottlenecking at a UART connection to WiFi.", "caption": "Bandwidth funneling from BLE nodes through UART bottleneck."}, "chain_ids": ["tinyml-chain-auto-secondary-017-50"], "chain_positions": {"tinyml-chain-auto-secondary-017-50": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-50": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1644", "title": "Audio Event Queue Overflow", 
"topic": "queueing-theory", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Why does the MCU buffer rapidly overflow when arrival rates exceed 95 events per second, and why is M/D/1 the correct model rather than M/M/1?", "visual": {"kind": "svg", "path": "tinyml-1644.svg", "alt": "Hockey-stick graph displaying expected buffer entries exploding as arrival rate approaches 100 events/sec.", "caption": "M/M/1 Queue Length vs Arrival Rate."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1646", "title": "MCU Weight Storage Hierarchy", "topic": "memory-hierarchy-design", "competency_area": "cross-cutting", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how this memory hierarchy strictly dictates the storage and execution strategy for the model's weights?", "visual": {"kind": "svg", "path": "tinyml-1646.svg", "alt": "Bar chart showing a 500KB model fitting in 1MB Flash but overflowing 256KB SRAM.", "caption": "Memory capacity vs model footprint."}, "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 1}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1652", "title": "BLE Mesh Federated Embedding Reduce", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total bytes transmitted per node for a 600-byte feature vector across the 3-node ring using the Ring AllReduce communication-cost formula, and why does Ring AllReduce win?", "visual": {"kind": "svg", "path": "tinyml-1652.svg", "alt": "Three node cyclic ring topology.", "caption": "Ring AllReduce logical topology for 3 nodes."}, "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1658", "title": "Cortex-M4 Quantization Memory Saturation", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the root cause of the 5x latency degradation when migrating from 8-bit to 16-bit quantization on a Cortex-M4?", "visual": {"kind": "svg", "path": "tinyml-1658.svg", "alt": "Bar chart showing INT8 well under the 256KB threshold and INT16 drastically over it.", "caption": "SRAM Capacity vs Model Quantization Footprint."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1659", "title": "Cortex-M4 Seismic Duty Cycle", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the average power consumption if the MCU runs inference exactly once per second?", 
"visual": {"kind": "svg", "path": "tinyml-1659.svg", "alt": "A sleep/wake timeline showing 15mW spikes every 1 second over a near-zero baseline.", "caption": "Duty cycle power spikes across a 3-second window."}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1662", "title": "Cortex-M4 Cascade Wakeword", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the expected average power consumption of this cascade architecture?", "visual": {"kind": "svg", "path": "tinyml-1662.svg", "alt": "Sleep/wake timeline showing constant DSP baseline with occasional Neural Net spikes.", "caption": "Cascade architecture duty cycling."}, "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 0}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1664", "title": "Smart Doorbell MCU Queue Overflow and Tail-Drop Shedding Policy", "topic": "queueing-theory", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the maximum tolerable arrival rate before the queue length goes to infinity, and how do you design a shedding policy if arrivals burst to 100 frames per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1665", "title": "Energy Harvesting Checkpoints", "topic": "fault-tolerance-checkpointing", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What time-block checkpointing strategy to non-volatile memory maximizes forward progress while minimizing write overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1666", "title": "SPI/DMA Pipeline Overlap", "topic": "communication-computation-overlap", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can a dual-buffer DMA system overlap SPI sensor reads with DSP computations to ensure zero dropped samples?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 5}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1667", "title": "RTOS Preemption Deadline Miss", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "What is the root cause of the dropped frames, and what RTOS scheduling change fixes it?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1668", "title": "Frequency Scaling Duty Cycle", "topic": 
"duty-cycling", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design a duty-cycling schedule and optimize the clock frequency to minimize total energy consumed per hour?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1669", "title": "INT8 Cortex-M4 Footprint", "topic": "quantization-fundamentals", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the exact SRAM footprint reduction and qualitative execution impact of quantizing a 50K-parameter model to INT8 on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1670", "title": "Depthwise Convolutions on MCU", "topic": "compute-cost-estimation", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the total MAC count compared to a standard convolution, and what are the minimal clock cycles required assuming 1 MAC/cycle?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1673", "title": "Differential LoRaWAN Updates", "topic": "model-serving-infrastructure", "competency_area": "cross-cutting", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Analyze the full payload transmission time and design a differential weight update protocol to minimize OTA time?", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1674", "title": "RTC State Machine Duty Cycle", "topic": "duty-cycling", "competency_area": "cross-cutting", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "create", "status": "published", "phase": "both", "question": "How do you implement the state machine timing to maximize battery life, calculating the exact active duty cycle percentage?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1675", "title": "Asymmetric INT8 Calibration", "topic": "quantization-fundamentals", "competency_area": "cross-cutting", "track": "tinyml", "level": "L4", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the trade-off between using asymmetric Min-Max quantization versus symmetric calibration for preserving activation fidelity?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1676", "title": "Microcontroller Cycles and SRAM for 
Inference Cost", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total instruction cycles required for a single forward pass, specifying the SRAM footprint required for intermediate activations?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1678", "title": "Depthwise Separable Convolution Execution Cycles on Microcontrollers", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate why the actual inference latency on the Cortex-M4 only improved by 2.5x despite the 8x reduction in theoretical MAC operations?", "chain_ids": ["tinyml-chain-auto-secondary-004-03"], "chain_positions": {"tinyml-chain-auto-secondary-004-03": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-004-03": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1681", "title": "Cycle-Accurate Execution Cost Model for 1D Convolutions", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L6+", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an explicit cycle-accurate cost model for this 1D convolution and quantify the gap versus a naive 1.2x MAC heuristic?", "validated": false, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1682", "title": "Flash Memory Weight Placement for Bare-Metal Microcontrollers", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Specify how the microTVM compiler handles the model's weight initialization and memory layout in a bare-metal environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1688", "title": "TinyML Memory Paging Analysis", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How can layer-wise memory overlay be implemented to execute this model within the SRAM limit?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 3}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1690", "title": "TinyML DMA Audio Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How can Direct Memory Access (DMA) be applied to overlap continuous audio data acquisition with the CPU's neural network inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": 
"tinyml-1694", "title": "Calibration Bias in PTQ", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the potential causes of this accuracy drop and identify which quantization step likely introduced the bias?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1695", "title": "Wake-Word Event Queue Stability Limit", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Specify the maximum arrival rate this wake-word system can handle before the event queue diverges to infinity, and calculate the queue utilization if events arrive at 10 per second?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1696", "title": "Intermittent Power Checkpoint Overhead", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "create", "status": "published", "phase": "both", "question": "Design an evaluation metric for the checkpointing overhead and calculate the percentage of time spent checkpointing if state is saved after every 30ms of active compute?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1703", "title": "MCU Minimum Theoretical Inference Latency", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply these hardware constraints to calculate the absolute minimum latency for a single inference, ignoring memory limits?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1714", "title": "Recalling Little's Law Stability Conditions for Buffers", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "State the condition under which Little's Law (L = lambda * W) holds true for this buffer system?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1715", "title": "Understanding Flash Wear Out in High Frequency Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why standard internal Flash memory might be unsuitable for this high-frequency checkpointing, and what alternative on-chip component is preferred?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1716", "title": "Throughput Impact of I/O Overlap on Microcontrollers", "topic": 
"communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L5", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the realized throughput gain from double-buffering, accounting for DMA bus contention, and identify whether the chip's single-bank SRAM bottlenecks the maximum overlap?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 4}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1718", "title": "XIP Execution and SRAM Limits in Microcontrollers", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain why copying weights to SRAM will cause an OOM error, and how the memory hierarchy should be mapped to succeed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1719", "title": "DMA Overlap to Meet Hard Real-Time Deadlines", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Can a single-threaded execution model sustain the 100Hz rate, and what is the CPU idle percentage if DMA is used?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 3}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1720", "title": "Diagnosing Heap Exhaustion in TF Lite Micro", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "What is the likely cause of this memory exhaustion, given that TFLM utilizes a static memory arena for model execution?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1721", "title": "Average Power Calculation in Duty-Cycled Sensors", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate average current including analog leakage, estimate the realistic battery lifetime accounting for the non-linear discharge curve, and identify the dominating loss term?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 3}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1722", "title": "Missing Calibration in Post-Training Quantization", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", 
"question": "What critical quantization step was likely omitted or executed with unrepresentative data?", "chain_ids": ["tinyml-chain-auto-secondary-004-15"], "chain_positions": {"tinyml-chain-auto-secondary-004-15": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-15": "secondary"}, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1723", "title": "Triple-Buffering Pipeline Throughput Constraint", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a true triple-buffer scheme and compute the strict SRAM footprint needed, comparing it against ping-pong buffering?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1724", "title": "Wake-up Penalty Reduction via Batching", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how batching 10 inferences every 500 ms affects power efficiency AND determine which retention mode wins under both the unbatched (50 ms) and batched (500 ms) duty cycles?", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 2}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1725", "title": "Cortex-M4 Inference Throughput", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "implement", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the sustained inferences per second assuming 50% MAC hardware utilization?", "chain_ids": ["tinyml-chain-auto-secondary-004-04"], "chain_positions": {"tinyml-chain-auto-secondary-004-04": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-004-04": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1726", "title": "Static Quantization Parameters", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Explain how post-training static quantization computes the specific integer parameters required to convert FP32 activations into INT8?", "chain_ids": ["tinyml-chain-auto-secondary-004-14"], "chain_positions": {"tinyml-chain-auto-secondary-004-14": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-004-14": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1728", "title": "Camera Trap State Machine", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "Describe the optimal hardware power state transitions to minimize energy per event?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, 
{"id": "tinyml-1731", "title": "ARM SIMD DSP Extensions", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "recall", "bloom_level": "remember", "status": "published", "phase": "both", "question": "Recall the specific architectural instruction set feature used to accelerate quantized MACs on a Cortex-M4?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1732", "title": "DMA Audio Pipelining", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "fluency", "bloom_level": "understand", "status": "published", "phase": "both", "question": "How does ping-pong buffering permit continuous I2S DMA, and what pointer-swap logic prevents dropped audio frames?", "chain_ids": ["tinyml-chain-auto-secondary-016-21"], "chain_positions": {"tinyml-chain-auto-secondary-016-21": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-21": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1733", "title": "OTA Memory Alignment Faults", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the memory layout and identify the low-level addressing error crashing the CPU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1734", "title": "Evaluate Latency Bounds in Event-Driven Processing", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Can the MCU sustain this arrival rate without unbounded queue growth if a 2ms sleep-to-wake overhead is aggressively applied per event?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1735", "title": "Mitigating High Activation Memory in Deep Layers", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What memory management strategies can execute this specific bottleneck layer without exceeding the 256KB hardware SRAM limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1736", "title": "Design MAC Budget for Sub-50ms Wakeword Engine", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "create", "status": "published", "phase": "both", "question": "Create a theoretical compute budget in MACs for the neural network, assuming 4 MACs per SIMD instruction and 25% true pipeline utilization, to guarantee a sub-50ms inference time on an 80 MHz MCU?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1740", "title": "Analyze Checkpoint Overhead in Energy-Harvesting MCUs", "topic": 
"fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does varying the checkpoint frequency impact the total expected inference time under an unstable power environment?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1741", "title": "Diagnosing Quantization Collapse in Low-Amplitude Signals", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze how global symmetric quantization scales interact with high-variance input sensors, causing structural failure on subtle data patterns?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1742", "title": "Evaluate Sliding Window vs Quantized KV Cache", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the architectural trade-offs of using Sliding Window Attention versus INT8 KV Cache quantization to guarantee bounded SRAM usage without halting?", "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1744", "title": "Analyze Execute-In-Place Model Switching Latency", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L4", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "How does an Execute-In-Place (XIP) memory-mapped architecture reduce this model-switching bottleneck?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1745", "title": "Design a Sub-1mW Cascaded Acoustic Wake-up", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L6+", "zone": "optimization", "bloom_level": "create", "status": "published", "phase": "both", "question": "How can you design a multi-stage pipeline that guarantees the total average power stays below 1mW while processing sporadic acoustic events?", "chain_ids": ["tinyml-chain-auto-019-07"], "chain_positions": {"tinyml-chain-auto-019-07": 4}, "chain_tiers": {"tinyml-chain-auto-019-07": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1747", "title": "INT8 Quantization Memory Reduction", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the exact SRAM memory savings in bytes when quantizing this activation map from FP32 to INT8?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1750", "title": "Dense Layer MAC Count", "topic": "compute-cost-estimation", "competency_area": "compute", "track": "tinyml", "level": "L1", "zone": "realization", "bloom_level": "apply", "status": "published", 
"phase": "both", "question": "Calculate the total number of Multiply-Accumulate (MAC) operations required for a single forward pass of this layer?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1758", "title": "Evaluating INT16 Accumulation Trade-offs on Cortex-M4", "topic": "quantization-fundamentals", "competency_area": "precision", "track": "tinyml", "level": "L2", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What are the trade-offs of switching from INT32 to INT16 accumulation for intermediate dense layers to save memory?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1767", "title": "XIP Flash Memory for TinyML Models", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply a memory management strategy to seamlessly switch models without requiring a full device reboot or adding external RAM?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1769", "title": "In-Place Depth-First Execution on Cortex-M4", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L1", "zone": "optimization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Apply in-place depth-first execution to reduce the peak memory footprint below 64KB?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1770", "title": "Bare-Metal DMA Ring Buffers for TinyML", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "apply", "status": "published", "phase": "both", "question": "How would you design a bare-metal inference loop leveraging DMA to feed the RNN without CPU polling?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1773", "title": "Safe Checkpointing for Intermittent Power", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What safe checkpoint interval leaves time for a 0.1s NVM save within the 0.5s worst-case power window?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1778", "title": "Analyzing Queue Capacity During Traffic Bursts", "topic": "queueing-theory", "competency_area": "latency", "track": "tinyml", "level": "L4", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the utilization of the MCU and predict if the system can handle a temporary burst of 10 events/sec for 2 seconds?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1779", "title": "Applying Loop Tiling to SRAM Limits", "topic": 
"memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "fluency", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Which memory optimization technique can be applied to fit this layer's execution within the available 64KB limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1780", "title": "Evaluating OTA Payload Limits for Firmware", "topic": "model-serving-infrastructure", "competency_area": "deployment", "track": "tinyml", "level": "L5", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What are the trade-offs of A/B partitioned firmware updates versus dynamically loadable neural network blob updates?", "chain_ids": ["tinyml-chain-auto-secondary-016-05"], "chain_positions": {"tinyml-chain-auto-secondary-016-05": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-05": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1782", "title": "Analyzing SRAM Placement for Low-Latency Audio", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the optimal placement of the 16KB audio ring buffer within the Cortex-M4's memory hierarchy to minimize power consumption?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1783", "title": "Analyzing Watchdog Timers for Simple Recovery", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L2", "zone": "mastery", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the simplest fault-tolerance mechanism to recover from a brownout during the middle of the daily inference?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1788", "title": "SRAM vs Flash Layout", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Which memory placement strategy, including Flash XIP and SRAM use, keeps the model within SRAM limits?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1791", "title": "Intermittent Power Checkpointing", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Specify a non-volatile memory checkpointing routine that guarantees forward progress across brownouts?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1794", "title": "Microcontroller Event Power", "topic": "duty-cycling", "competency_area": "power", "track": "tinyml", "level": "L1", "zone": "specification", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the 
average power consumption of this duty-cycled system, assuming computation takes exactly 50ms per 2-second cycle?", "chain_ids": ["tinyml-chain-auto-019-06"], "chain_positions": {"tinyml-chain-auto-019-06": 0}, "chain_tiers": {"tinyml-chain-auto-019-06": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1796", "title": "Cortex-M4 SPI-Compute Overlap", "topic": "communication-computation-overlap", "competency_area": "optimization", "track": "tinyml", "level": "L2", "zone": "diagnosis", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the percentage reduction in per-inference latency if DMA is configured to completely overlap the BLE transmission with the next inference computation?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1797", "title": "Cortex-M4 Flash vs SRAM execution", "topic": "memory-hierarchy-design", "competency_area": "memory", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "What is the execution cycle difference between streaming directly from Flash for 5 passes versus copying the 100KB model to SRAM once?", "chain_ids": ["tinyml-chain-auto-026-01"], "chain_positions": {"tinyml-chain-auto-026-01": 2}, "chain_tiers": {"tinyml-chain-auto-026-01": "primary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1806", "title": "Power Loss Flash Write Margin Calculation", "topic": "fault-tolerance-checkpointing", "competency_area": "reliability", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Determine the safety margin in milliseconds remaining after the state is successfully flushed?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1808", "title": "Logical Ring on Physical Daisy-Chain", "topic": "collective-communication", "competency_area": "cross-cutting", "track": "tinyml", "level": "L5", "zone": "realization", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the synchronization stall time of a logical Ring AllReduce versus a bidirectional line-accumulator (Tree), accounting for a 10 microsecond SPI interrupt latency per step?", "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 2}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1809", "title": "RP2040 Core FIFO Spinlock", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the pipeline latency per audio frame if the FIFO spinlock synchronization costs 100us?", "chain_ids": ["tinyml-chain-auto-secondary-017-34"], "chain_positions": {"tinyml-chain-auto-secondary-017-34": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-34": "secondary"}, "validated": true, 
"math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1810", "title": "AHB Bus Interrupt Barrier", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Calculate the pipeline throughput assuming an AHB bus interrupt barrier requires 50us to clear?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1811", "title": "ESP-NOW Link Bubble", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "design", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the resulting pipeline bubble given the ESP-NOW MAC protocol imposes a 1.5ms synchronization latency per packet?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1812", "title": "SPI Daisy Chain Pipeline", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate the end-to-end inference latency for a single sample, factoring in a 200us SPI DMA setup synchronization barrier at each hop?", "chain_ids": ["tinyml-chain-auto-secondary-017-35"], "chain_positions": {"tinyml-chain-auto-secondary-017-35": 0}, "chain_tiers": {"tinyml-chain-auto-secondary-017-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1813", "title": "Dual-Core RP2040 Pipeline Bubble Fraction", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the pipeline compute bubble fraction for Core 1 during steady-state processing?", "chain_ids": ["tinyml-chain-auto-secondary-017-34"], "chain_positions": {"tinyml-chain-auto-secondary-017-34": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-34": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1814", "title": "Asymmetric Cortex-M Pipeline Synchronization Overhead", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "Calculate the total latency for the 4-microbatch pipeline, including the pipeline flush and barrier synchronization overheads?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1816", "title": "SPI Double-Buffering Pipeline Barrier Computation", "topic": "pipeline-parallelism", "competency_area": "parallelism", "track": "tinyml", "level": "L3", "zone": "specification", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the maximum SPI payload size allowable to maintain the optimal 4ms pipeline throughput, assuming a 50us DMA setup barrier per transfer?", "chain_ids": 
["tinyml-chain-auto-secondary-017-35"], "chain_positions": {"tinyml-chain-auto-secondary-017-35": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-017-35": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1817", "title": "Half-Duplex UART Ring AllReduce Diagnosis", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "diagnosis", "bloom_level": "analyze", "status": "published", "phase": "both", "question": "Analyze the root cause of the degraded latency by computing the theoretical lower bound for the Ring AllReduce operation on this topology?", "chain_ids": ["tinyml-chain-auto-secondary-016-20"], "chain_positions": {"tinyml-chain-auto-secondary-016-20": 1}, "chain_tiers": {"tinyml-chain-auto-secondary-016-20": "secondary"}, "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1818", "title": "CAN Bus AllGather Broadcast Arbitration", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "analyze", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the total time required for an AllGather operation between the two nodes, including the bus framing overhead?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1819", "title": "Shared Medium RF Parameter Server Topology", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L4", "zone": "design", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "Evaluate which topology minimizes total transmission time over the shared RF medium, calculating the minimum time for the optimal choice?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1821", "title": "BLE 5.0 Star Topology Collective Sync", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "evaluation", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": "What is the total latency for an AllReduce equivalent operation where both followers send 350 KB to the leader, and the leader broadcasts the averaged 350 KB model back?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1822", "title": "LoRa Mesh Duty Cycle Pipeline Impact", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "realization", "bloom_level": "apply", "status": "published", "phase": "both", "question": "What is the absolute minimum elapsed time for a single node to complete its AllReduce portion, factoring in the duty-cycle stall?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}, {"id": "tinyml-1823", "title": "Wi-Fi 6 CSMA Gossip Protocol Congestion", "topic": "collective-communication", "competency_area": "networking", "track": "tinyml", "level": "L3", "zone": "mastery", "bloom_level": "evaluate", "status": "published", "phase": "both", "question": 
"Evaluate the total expected time to reach 99% weight propagation across the network, accounting for the CSMA-enforced concurrency limit?", "validated": true, "math_verified": true, "scenario": "", "details": {"common_mistake": "", "realistic_solution": "", "napkin_math": ""}}] diff --git a/interviews/staffml/src/data/vault-manifest.json b/interviews/staffml/src/data/vault-manifest.json index 0ed7f7465..019525740 100644 --- a/interviews/staffml/src/data/vault-manifest.json +++ b/interviews/staffml/src/data/vault-manifest.json @@ -1,26 +1,26 @@ { "releaseId": "dev", - "releaseHash": "d514a795f68e5c76d8bc7242783fac27d4017e8c0561a09edd75c3dd6620c7db", + "releaseHash": "2ab29a09eb919139616b11e5c3b9d843dd9520ec155f6a23a1261f719441f933", "schemaVersion": "1", "policyVersion": "1", - "buildDate": "2026-05-04T12:49:59Z", - "questionCount": 9446, + "buildDate": "2026-05-05T13:28:43Z", + "questionCount": 9521, "chainCount": 843, "conceptCount": 87, "trackDistribution": { - "cloud": 4028, - "edge": 2079, - "global": 313, - "mobile": 1824, - "tinyml": 1202 + "cloud": 4077, + "edge": 2093, + "global": 317, + "mobile": 1826, + "tinyml": 1208 }, "levelDistribution": { - "L4": 2570, - "L1": 534, - "L2": 1043, - "L3": 2347, - "L5": 2140, - "L6+": 812 + "L4": 2591, + "L1": 543, + "L2": 1053, + "L3": 2360, + "L5": 2157, + "L6+": 817 }, "areaCount": 13, "taxonomyVersion": "87-topics" diff --git a/interviews/staffml/src/lib/corpus.ts b/interviews/staffml/src/lib/corpus.ts index f1a0fa20f..9c185c52a 100644 --- a/interviews/staffml/src/lib/corpus.ts +++ b/interviews/staffml/src/lib/corpus.ts @@ -521,18 +521,20 @@ function shouldUseStaticDetails(): boolean { async function getStaticFullDetail(id: string, summary: Question): Promise { if (!_staticDetailsCache) { - // Function-constructor dynamic import: hides the path from Turbopack's - // static analyzer so prod builds don't require corpus.json to exist. - // corpus.json is materialized on disk only when a contributor runs - // `vault build --local-json` locally with NEXT_PUBLIC_VAULT_FALLBACK= - // static. If the file is missing at runtime, the import rejects and - // the caller surfaces an error to the UI. - const dynImport = new Function( - "p", - "return import(p)", - ) as (p: string) => Promise<{ default: Question[] }>; - const mod = await dynImport("../data/corpus.json"); - _staticDetailsCache = new Map(mod.default.map((q) => [q.id, q])); + // Fetch corpus.json from /data/corpus.json (served from public/). This + // file is written by `vault build --local-json` and exists only in local + // dev. Production deploys neither emit nor bundle it; the worker fetch + // path handles those. If the file is missing at runtime the fetch fails + // and the caller surfaces an error to the UI. + const res = await fetch("/data/corpus.json"); + if (!res.ok) { + throw new Error( + `Static corpus.json not available at /data/corpus.json (status ${res.status}). 
` + + "Run \`vault build --local-json\` from the repo root to regenerate it.", + ); + } + const data = (await res.json()) as Question[]; + _staticDetailsCache = new Map(data.map((q) => [q.id, q])); } const full = _staticDetailsCache.get(id); if (!full) return undefined; diff --git a/interviews/vault-cli/src/vault_cli/commands/build.py b/interviews/vault-cli/src/vault_cli/commands/build.py index c17d0513f..274804521 100644 --- a/interviews/vault-cli/src/vault_cli/commands/build.py +++ b/interviews/vault-cli/src/vault_cli/commands/build.py @@ -36,11 +36,15 @@ def register(app: typer.Typer) -> None: local_json: bool = typer.Option( False, "--local-json", - help="Also write a site-readable corpus.json at " - "interviews/staffml/src/data/corpus.json so the StaffML " - "frontend can serve full question content from disk during " - "local dev (with NEXT_PUBLIC_VAULT_FALLBACK=static). " - "Production never reads this file; it is dev-only.", + "--local", + help="Materialize the local-dev artifacts so the StaffML frontend " + "can serve full question content from disk: writes " + "interviews/staffml/src/data/corpus.json AND mirrors it to " + "interviews/staffml/public/data/corpus.json (the path the " + "Next.js loader actually fetches with " + "NEXT_PUBLIC_VAULT_FALLBACK=static). Production never reads " + "either file; this is dev-only. The shorter --local alias " + "is preferred.", ), ) -> None: """Compile all YAML questions under vault/questions/ to a SQLite file. @@ -73,6 +77,19 @@ def register(app: typer.Typer) -> None: f"[dim]local corpus.json: {local_result['count']} questions → " f"{local_result['output']}[/dim]" ) + # Mirror corpus.json into public/data/ so Next can serve it as a + # static asset. The frontend's getStaticFullDetail() fetches + # /data/corpus.json (set NEXT_PUBLIC_VAULT_FALLBACK=static to + # opt in) — Turbopack does not bundle the src/data/ copy because + # it would balloon the prod bundle, so the public mirror is the + # only reliable runtime path in local dev. + public_out = Path("interviews/staffml/public/data/corpus.json") + public_out.parent.mkdir(parents=True, exist_ok=True) + public_out.write_bytes(local_out.read_bytes()) + console.print( + f"[dim]public mirror: {local_result['count']} questions → " + f"{public_out}[/dim]" + ) # Mirror visual assets alongside the JSON. The frontend # references /question-visuals//.svg directly # from Next.js's public/ tree — no hydration or worker